Merge remote-tracking branch 'origin/master' into thilow/SimpleExampleSSM
Commit 98c2fbdcd1
@@ -269,3 +269,4 @@ Tutorials/slots.wl
/packages
/CNTK.VC.db
/CNTK.VC.VC.opendb
.cache
@@ -6,8 +6,6 @@ DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision = "float"
deviceId = "Auto"

@@ -88,7 +86,7 @@ Train = {
    SGD = {
        epochSize = 0
        minibatchSize = 256
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we multiply Caffe's learning rate by (1-momentum)
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we divide Caffe's learning rate by (1-momentum)
        learningRatesPerMB = 0.1*25:0.01*25:0.001*25:0.0001*25:0.00001
        momentumPerMB = 0.9
        maxEpochs = 112
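A side note on the corrected comment (not part of the diff): with unit-gain momentum, CNTK weights each incoming gradient by (1-momentum), so matching Caffe's effective step size means dividing Caffe's learning rate by (1-momentum). A minimal sketch of that conversion, assuming Caffe's published AlexNet base rate of 0.01:

```python
# Hedged sketch: convert a Caffe learning rate to its CNTK unit-gain equivalent.
def caffe_lr_to_cntk_unit_gain(caffe_lr, momentum):
    # CNTK's unit-gain update weights the new gradient by (1 - momentum),
    # so we divide to keep the effective step size unchanged.
    return caffe_lr / (1.0 - momentum)

# momentumPerMB = 0.9 above, so Caffe's 0.01 maps to 0.1 -- the first entry
# of learningRatesPerMB = 0.1*25:0.01*25:...
assert abs(caffe_lr_to_cntk_unit_gain(0.01, 0.9) - 0.1) < 1e-12
```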
@@ -17,7 +17,7 @@ from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.blocks import Placeholder, Block
from cntk.layers import Convolution, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.layers import Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.models import Sequential
from cntk.initializer import normal

@@ -32,6 +32,7 @@ image_height = 227
image_width = 227
num_channels = 3 # RGB
num_classes = 1000
model_name = "AlexNet.model"

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):

@@ -58,6 +59,7 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

@@ -95,21 +97,21 @@ def create_alexnet():
    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            Convolution((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Activation(activation=relu, name='relu1'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3,3), (2,2), name='pool1'),

            Convolution((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Activation(activation=relu, name='relu2'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3,3), (2,2), name='pool2'),

            Convolution((3,3), 384, init=normal(0.01), name='conv3'),
            Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
            Activation(activation=relu, name='relu3'),
            Convolution((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Activation(activation=relu, name='relu4'),
            Convolution((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Activation(activation=relu, name='relu5'),
            MaxPooling((3,3), (2,2), name='pool5'),

@@ -123,8 +125,9 @@ def create_alexnet():
        ])(input)

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

@@ -133,6 +136,7 @@ def create_alexnet():
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }

@@ -145,9 +149,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
    l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight),
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

@@ -155,7 +160,7 @@ def create_trainer(network, epoch_size, num_quantization_bits):
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size):
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {

@@ -163,36 +168,27 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(train_source, trainer,
        cntk.minibatch_size_schedule(minibatch_size), progress_printer, input_map, os.path.join(model_path, "AlexNet_"), epoch_size)
    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency = epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches
    training_session.train()

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data: break
        local_mb_samples=data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom)
    progress_printer.end_progress_print(fin_msg)

    print("")
    print(fin_msg)
    print("")

    return metric_numer/metric_denom


# Train and evaluate the network.
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
                           log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
                           restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=True):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(

@@ -207,35 +203,46 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, mini
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('-datadir', help='specify the location of your data');
    parser.add_argument('-logdir', help='specify where the training log will be saved');
    parser.add_argument('-outputdir', help='specify where the output model/checkpoint files shall be saved');
    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='112')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='256')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='1281167')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['datadir'] != None:
        data_path = args['datadir']

    if args['logdir'] != None:
        log_dir = args['logdir']

    if args['outputdir'] != None:
    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'val_map.txt')

    alexnet_train_and_eval(train_data, test_data,
        num_quantization_bits=32,
        max_epochs=112,
        log_to_file=log_dir,
        num_mbs_per_log=500,
        gen_heartbeat=True)
    Communicator.finalize()
    try:
        alexnet_train_and_eval(train_data, test_data,
                               minibatch_size=args['minibatch_size'],
                               epoch_size=args['epoch_size'],
                               num_quantization_bits=args['quantized_bits'],
                               max_epochs=args['num_epochs'],
                               restore=not args['restart'],
                               log_to_file=args['logdir'],
                               num_mbs_per_log=200,
                               gen_heartbeat=True)
    finally:
        cntk.distributed.Communicator.finalize()
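For reference, the manual evaluation loop deleted above (its role is now covered by the training session's cross-validation source) computed a sample-weighted average of per-minibatch error rates. The same aggregation in isolation, with hypothetical (error, sample-count) pairs:

```python
# trainer.test_minibatch returns the mean error over one minibatch, so the
# removed loop weighted each result by its minibatch's sample count.
batches = [(0.25, 256), (0.30, 256), (0.20, 128)]  # hypothetical (mean error, #samples)
metric_numer = sum(err * n for err, n in batches)
metric_denom = sum(n for _, n in batches)
print("Final Results: errs = {:0.2f}% * {}".format(metric_numer * 100.0 / metric_denom, metric_denom))
```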
@@ -45,6 +45,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
        cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
            features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = cntk.io.StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize=train,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

@@ -105,15 +106,15 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_
        l2_regularization_weight=l2_reg_weight)

    if block_size != None:
        learner = cntk.distributed.block_momentum_distributed_learner(local_learner, block_size=block_size)
        parameter_learner = cntk.distributed.block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = cntk.distributed.data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
        parameter_learner = cntk.distributed.data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], learner)
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size):
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {

@@ -125,20 +126,23 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(64),
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        # save_all_checkpoints = False,
        progress_frequency=epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore=restore)

    # Train all minibatches
    training_session.train()

# Train and evaluate the network.
def convnet_cifar10_dataaug(train_data, test_data, mean_data, epoch_size=50000, num_quantization_bits=32,
                            block_size=3200, warm_up=0, max_epochs=2, log_to_file=None,
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
                            block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
                            num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

@@ -154,7 +158,7 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, epoch_size=50000,
    trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up)
    train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

@@ -165,20 +169,25 @@ if __name__=='__main__':
    parser.add_argument('-datadir', '--datadir', help='Data directory where the CIFAR dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-e', '--epochs', help='Total number of epochs to train', type=int, required=False, default='160')
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='160')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='64')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='50000')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-a', '--distributed_after', help='Number of samples to train with before running distributed', type=int, required=False, default='0')
    parser.add_argument('-b', '--block_samples', type=int, help="Number of samples per block for block momentum (BM) distributed learner (if 0 BM learner is not used)", required=False, default=None)
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    mean_data=os.path.join(data_path, 'CIFAR-10_mean.xml')
    train_data=os.path.join(data_path, 'train_map.txt')

@@ -186,14 +195,16 @@ if __name__=='__main__':

    try:
        convnet_cifar10_dataaug(train_data, test_data, mean_data,
                                epoch_size=50000,
                                minibatch_size=args['minibatch_size'],
                                epoch_size=args['epoch_size'],
                                num_quantization_bits=args['quantized_bits'],
                                block_size=args['block_samples'],
                                warm_up=args['distributed_after'],
                                max_epochs=args['epochs'],
                                max_epochs=args['num_epochs'],
                                restore=not args['restart'],
                                log_to_file=args['logdir'],
                                num_mbs_per_log=10,
                                gen_heartbeat=True)
                                num_mbs_per_log=100,
                                gen_heartbeat=False)
    finally:
        cntk.distributed.Communicator.finalize()
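With the new arguments, the learner selection above is driven entirely from the command line: `-b` (block_samples) switches on the block-momentum learner, while leaving it at its default of None falls through to data-parallel SGD with `-q` quantization bits. Hypothetical invocations (worker count assumed): `mpiexec -n 4 python ConvNet_CIFAR10_DataAug_Distributed.py -b 3200` for block momentum, or `mpiexec -n 4 python ConvNet_CIFAR10_DataAug_Distributed.py -q 1` for 1-bit quantized data-parallel SGD.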
@@ -1,4 +1,4 @@
# Node: ResNet-50 with ImageNet -- 101 layers bottleneck ResNet for image classification
# ResNet-101 with ImageNet -- 101 layers bottleneck ResNet for image classification
# Reference: "Deep Residual Learning for Image Recognition" https://arxiv.org/abs/1512.03385

command = TrainNetwork:BNStatistics:Eval

@@ -1,4 +1,4 @@
# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping).
# ResNet110 applied on CIFAR-10 dataset, with data augmentation (translation and flipping).

command = TrainConvNet:Eval

@@ -1,4 +1,4 @@
# Node: ResNet-50 with ImageNet -- 152 layers bottleneck ResNet for image classification
# ResNet-152 with ImageNet -- 152 layers bottleneck ResNet for image classification
# Reference: "Deep Residual Learning for Image Recognition" https://arxiv.org/abs/1512.03385

command = TrainNetwork:BNStatistics:Eval

@@ -1,4 +1,4 @@
# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping).
# ResNet20 applied on CIFAR-10 dataset, with data augmentation (translation and flipping).

command = TrainConvNet:Eval

@@ -1,4 +1,4 @@
# Node: ResNet-50 with ImageNet -- 50 layers bottleneck ResNet for image classification
# ResNet-50 with ImageNet -- 50 layers bottleneck ResNet for image classification
# Reference: "Deep Residual Learning for Image Recognition" https://arxiv.org/abs/1512.03385

command = TrainNetwork:BNStatistics:Eval
@@ -0,0 +1,19 @@
# CNTK Examples: Image/Classification/VGG

## BrainScript

### VGG16_ImageNet.cntk

This is the VGG model that contains 16 layers, which was referred to as `ConvNet configuration D` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`cntk configFile=VGG16_ImageNet.cntk`

### VGG19_ImageNet.cntk

This is the VGG model that contains 19 layers, which was referred to as `ConvNet configuration E` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`cntk configFile=VGG19_ImageNet.cntk`
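One usage note beyond the README (an assumption about the CNTK command line, not stated in this diff): top-level settings in these configs can typically be overridden with extra `name=value` arguments, e.g. `cntk configFile=VGG16_ImageNet.cntk deviceId=0 command=Train` to force a particular GPU and run only the Train action defined in the config below.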
@@ -0,0 +1,167 @@
# VGG16 with ImageNet -- 16 layers ConvNet for image classification
# Reference: "Very Deep Convolutional Networks for Large-Scale Image Recognition" https://arxiv.org/abs/1409.1556

RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

precision = "float"
deviceId = "Auto"

command = Train:Test

parallelTrain = "true"
traceLevel = 1
numMBsToShowResult = 500

modelPath = "$ModelDir$/VGG16"
stderr = "$OutputDir$/VGG16"

ImageH = 224
ImageW = 224
ImageC = 3
NumLabels = 1000

parallelTrain = true
hyperCompressMemory = true

################################
Train = {
    action = "train"

    BrainScriptNetworkBuilder = {
        imageShape = $ImageH$:$ImageW$:$ImageC$
        labelDim = $NumLabels$

        model = Sequential (
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            DenseLayer {4096, activation=ReLU} : Dropout :
            DenseLayer {4096, activation=ReLU} : Dropout :
            LinearLayer {labelDim}
        )

        # inputs
        features = Input {imageShape}
        featNorm = features - Splice(Constant(104):Constant(117):Constant(124), axis=3)
        labels = Input {labelDim}

        # apply model to features
        z = model (featNorm)

        # loss and error computation
        ce = CrossEntropyWithSoftmax (labels, z)
        errs = ClassificationError (labels, z)
        top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action

        # declare special nodes
        featureNodes = (features)
        labelNodes = (labels)
        criterionNodes = (ce)
        evaluationNodes = (errs)
        outputNodes = (z)
    }

    SGD = {
        epochSize = 0
        minibatchSize = 128
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we divide Caffe's learning rate by (1-momentum)
        learningRatesPerMB = 0.1*20:0.01*20:0.001*20:0.0001*10:0.00001
        momentumPerMB = 0.9
        maxEpochs = 80
        gradUpdateType = None
        L2RegWeight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
        dropoutRate = 0.5

        # TODO: try fewer bits?
        ParallelTrain = {
            parallelizationMethod = "DataParallelSGD"
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = {
                gradientBits = 32
            }
        }

        numMBsToShowResult = 250
    }

    # Reader
    reader = {
        verbosity = 0 ; randomize = true
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/train_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "randomSide" ; sideRatio = 0.4375:0.875 ; jitterType = "uniRatio" } : # [256, 512] jitter in scale
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }

    cvreader = {
        verbosity = 0 ; randomize = false
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "Center" ; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }
}

################################
Test = {
    action=test
    minibatchSize=128
    evalNodeNames = errs:top5Errs # also test top-5 error rate

    # Reader
    reader = {
        verbosity = 0
        randomize = false

        deserializers = (
        {
            type = "ImageDeserializer" ; module = "ImageReader"
            file="$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop"; cropType = "center"; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = 1000}
            }
        })
    }
}
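The crop ratios in the readers above decode as target-side over source-side fractions; that reading is an inference from the inline comments, not stated explicitly in the config. A quick check of the arithmetic:

```python
# sideRatio = 0.4375:0.875 -> random scale jitter between 512- and 256-pixel sides
print(224 / 512)  # 0.4375     (lower bound of the training jitter range)
print(224 / 256)  # 0.875      (upper bound of the training jitter range)
print(224 / 384)  # 0.5833333  (fixed center-crop ratio: "384 crop to 224")
```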
@@ -0,0 +1,170 @@
# VGG19 with ImageNet -- 19 layers ConvNet for image classification
# Reference: "Very Deep Convolutional Networks for Large-Scale Image Recognition" https://arxiv.org/abs/1409.1556

RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

precision = "float"
deviceId = "Auto"

command = Train:Test

parallelTrain = "true"
traceLevel = 1
numMBsToShowResult = 500

modelPath = "$ModelDir$/VGG19"
stderr = "$OutputDir$/VGG19"

ImageH = 224
ImageW = 224
ImageC = 3
NumLabels = 1000

parallelTrain = true
hyperCompressMemory = true

################################
Train = {
    action = "train"

    BrainScriptNetworkBuilder = {
        imageShape = $ImageH$:$ImageW$:$ImageC$
        labelDim = $NumLabels$

        model = Sequential (
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            DenseLayer {4096, activation=ReLU} : Dropout :
            DenseLayer {4096, activation=ReLU} : Dropout :
            LinearLayer {labelDim}
        )

        # inputs
        features = Input {imageShape}
        featNorm = features - Splice(Constant(104):Constant(117):Constant(124), axis=3)
        labels = Input {labelDim}

        # apply model to features
        z = model (featNorm)

        # loss and error computation
        ce = CrossEntropyWithSoftmax (labels, z)
        errs = ClassificationError (labels, z)
        top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action

        # declare special nodes
        featureNodes = (features)
        labelNodes = (labels)
        criterionNodes = (ce)
        evaluationNodes = (errs)
        outputNodes = (z)
    }

    SGD = {
        epochSize = 0
        minibatchSize = 128
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we divide Caffe's learning rate by (1-momentum)
        learningRatesPerMB = 0.1*20:0.01*20:0.001*20:0.0001*10:0.00001
        momentumPerMB = 0.9
        maxEpochs = 80
        gradUpdateType = None
        L2RegWeight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
        dropoutRate = 0.5

        # TODO: try fewer bits?
        ParallelTrain = {
            parallelizationMethod = "DataParallelSGD"
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = {
                gradientBits = 32
            }
        }

        numMBsToShowResult = 250
    }

    # Reader
    reader = {
        verbosity = 0 ; randomize = true
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/train_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "randomSide" ; sideRatio = 0.4375:0.875 ; jitterType = "uniRatio" } : # [256, 512] jitter in scale
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }

    cvreader = {
        verbosity = 0 ; randomize = false
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "Center" ; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }
}

################################
Test = {
    action=test
    minibatchSize=128
    evalNodeNames = errs:top5Errs # also test top-5 error rate

    # Reader
    reader = {
        verbosity = 0
        randomize = false

        deserializers = (
        {
            type = "ImageDeserializer" ; module = "ImageReader"
            file="$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop"; cropType = "center"; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = 1000}
            }
        })
    }
}
@@ -1,7 +0,0 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)

# Add top-5 error prediction node.
ErrTop5 = ClassificationError(labels, OutputNodes.z, Const(5), tag = "eval")

SaveModel(m1, $NewModel$, format=cntk)

(File diff not shown because of its large size.)
@@ -1,55 +0,0 @@
# Fully-connected layer with ReLU activation.
DnnReLULayer(inDim, outDim, x, wScale, bValue)
[
    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = Parameter(outDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
    y = RectifiedLinear(z)
]

# Fully-connected layer with batch normalization and ReLU activation.
DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
[
    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = Parameter(outDim, 1, init = fixedValue, value = bValue)
    sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
    m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false)
    y = RectifiedLinear(bn)
]

# Fully-connected layer.
DnnLayer(inDim, outDim, x, wScale, bValue)
[
    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = Parameter(outDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
]

# Convolutional layer with ReLU activation.
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn")
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
    z = Plus(c, b);
    y = RectifiedLinear(z);
]

# Convolutional layer with batch normalization and ReLU activation.
ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue)
[
    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = Parameter(outMap, 1, init = fixedValue, value = bValue)
    sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
    m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
    bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn")
    y = RectifiedLinear(bn);
]
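These hand-written NDL macros are superseded in this commit by the built-in BrainScript layer library used in the new VGG configs: the `ConvReLULayer` macro corresponds to a `ConvolutionalLayer {..., pad = true} : ReLU` pair, and the explicit `Parameter`/`Times`/`Plus` macros (`DnnLayer`, `DnnReLULayer`) to `LinearLayer` and `DenseLayer {..., activation=ReLU}`.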
@@ -0,0 +1,27 @@
# CNTK Examples: Image/Classification/VGG

## Python

### VGG16_ImageNet_Distributed.py

This is the VGG model that contains 16 layers, which was referred to as `ConvNet configuration D` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`python VGG16_ImageNet_Distributed.py`

To run it in a distributed manner, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#32-python). For example, the command for distributed training on the same machine (with multiple GPUs) on Windows is:

`mpiexec -n <#workers> python VGG16_ImageNet_Distributed.py`

### VGG19_ImageNet_Distributed.py

This is the VGG model that contains 19 layers, which was referred to as `ConvNet configuration E` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`python VGG19_ImageNet_Distributed.py`

To run it in a distributed manner, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#32-python). For example, the command for distributed training on the same machine (with multiple GPUs) on Windows is:

`mpiexec -n <#workers> python VGG19_ImageNet_Distributed.py`
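Beyond the bare invocation above, the scripts' argument parsers (defined below) expose the training knobs as flags; a hypothetical combined run might be `mpiexec -n 4 python VGG16_ImageNet_Distributed.py -q 1 -m 128 -datadir <path-to-ImageNet>` for 1-bit gradient quantization with an explicit minibatch size and data directory.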
@@ -0,0 +1,242 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

from __future__ import print_function
import os
import math
import argparse
import numpy as np
import cntk
import _cntk_py

from cntk.utils import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.blocks import Placeholder, Block
from cntk.layers import Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.models import Sequential, LayerStack
from cntk.initializer import normal

# default paths relative to the current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "ImageNet")
model_path = os.path.join(abs_path, "Models")
log_dir = None

# model dimensions
image_height = 224
image_width = 224
num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG16.model"

cntk.cntk_py.enable_hyper_memory_compress()

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." %map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio='0.4375:0.875', jitter_type='uniratio') # train uses jitter
        ]
    else:
        transforms += [
            ImageDeserializer.crop(crop_type='center', side_ratio=0.5833333) # test has no jitter
        ]

    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

# Create the network.
def create_vgg16():

    # Input variables denoting the features and label data
    feature_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # apply model to input
    # remove mean value
    input = minus(feature_var, constant([[[104]], [[117]], [[124]]]), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            LayerStack(2, lambda i: [
                Convolution2D((3,3), 64, name='conv1_{}'.format(i)),
                Activation(activation=relu, name='relu1_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool1'),

            LayerStack(2, lambda i: [
                Convolution2D((3,3), 128, name='conv2_{}'.format(i)),
                Activation(activation=relu, name='relu2_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool2'),

            LayerStack(3, lambda i: [
                Convolution2D((3,3), 256, name='conv3_{}'.format(i)),
                Activation(activation=relu, name='relu3_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool3'),

            LayerStack(3, lambda i: [
                Convolution2D((3,3), 512, name='conv4_{}'.format(i)),
                Activation(activation=relu, name='relu4_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool4'),

            LayerStack(3, lambda i: [
                Convolution2D((3,3), 512, name='conv5_{}'.format(i)),
                Activation(activation=relu, name='relu5_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool5'),

            Dense(4096, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, name='fc8')
        ])(input)

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }

# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency = epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches
    training_session.train()

# Train and evaluate the network.
def vgg16_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
                         restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg16()
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='80')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='128')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='1281167')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'val_map.txt')

    try:
        vgg16_train_and_eval(train_data, test_data,
                             minibatch_size=args['minibatch_size'],
                             epoch_size=args['epoch_size'],
                             num_quantization_bits=args['quantized_bits'],
                             max_epochs=args['num_epochs'],
                             restore=not args['restart'],
                             log_to_file=args['logdir'],
                             num_mbs_per_log=200,
                             gen_heartbeat=True)
    finally:
        cntk.distributed.Communicator.finalize()
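Two routes to Caffe-compatible updates appear in this commit: the BrainScript configs keep unit gain and divide the learning rate by (1-momentum), while these Python scripts set `unit_gain=False` and reuse Caffe's rates unchanged. The per-epoch schedule above expands as a plain list whose last value persists; a minimal sketch of that expansion (no CNTK required, the helper is hypothetical):

```python
lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]

def lr_for_epoch(epoch):
    # Schedules repeat their final value once the list is exhausted.
    return lr_per_mb[min(epoch, len(lr_per_mb) - 1)]

assert lr_for_epoch(0) == 0.01       # epochs 0-19
assert lr_for_epoch(25) == 0.001     # epochs 20-39
assert lr_for_epoch(79) == 0.000001  # tail value for the remaining epochs
```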
@@ -0,0 +1,242 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

from __future__ import print_function
import os
import math
import argparse
import numpy as np
import cntk
import _cntk_py

from cntk.utils import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.blocks import Placeholder, Block
from cntk.layers import Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.models import Sequential, LayerStack
from cntk.initializer import normal

# default paths relative to the current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "ImageNet")
model_path = os.path.join(abs_path, "Models")
log_dir = None

# model dimensions
image_height = 224
image_width = 224
num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG19.model"

cntk.cntk_py.enable_hyper_memory_compress()

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." %map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio='0.4375:0.875', jitter_type='uniratio') # train uses jitter
        ]
    else:
        transforms += [
            ImageDeserializer.crop(crop_type='center', side_ratio=0.5833333) # test has no jitter
        ]

    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

# Create the network.
def create_vgg19():

    # Input variables denoting the features and label data
    feature_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # apply model to input
    # remove mean value
    input = minus(feature_var, constant([[[104]], [[117]], [[124]]]), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            LayerStack(2, lambda i: [
                Convolution2D((3,3), 64, name='conv1_{}'.format(i)),
                Activation(activation=relu, name='relu1_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool1'),

            LayerStack(2, lambda i: [
                Convolution2D((3,3), 128, name='conv2_{}'.format(i)),
                Activation(activation=relu, name='relu2_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool2'),

            LayerStack(4, lambda i: [
                Convolution2D((3,3), 256, name='conv3_{}'.format(i)),
                Activation(activation=relu, name='relu3_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool3'),

            LayerStack(4, lambda i: [
                Convolution2D((3,3), 512, name='conv4_{}'.format(i)),
                Activation(activation=relu, name='relu4_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool4'),

            LayerStack(4, lambda i: [
                Convolution2D((3,3), 512, name='conv5_{}'.format(i)),
                Activation(activation=relu, name='relu5_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool5'),

            Dense(4096, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, name='fc8')
        ])(input)

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }

# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency = epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches
    training_session.train()

# Train and evaluate the network.
def vgg19_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
                         restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg19()
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='80')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='128')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='1281167')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'val_map.txt')

    try:
        vgg19_train_and_eval(train_data, test_data,
                             minibatch_size=args['minibatch_size'],
                             epoch_size=args['epoch_size'],
                             num_quantization_bits=args['quantized_bits'],
                             max_epochs=args['num_epochs'],
                             restore=not args['restart'],
                             log_to_file=args['logdir'],
                             num_mbs_per_log=200,
                             gen_heartbeat=True)
    finally:
        cntk.distributed.Communicator.finalize()
@@ -0,0 +1,25 @@
# CNTK Examples: Image/Classification/VGG

## Overview

|Data:     |The ILSVRC2012 dataset (http://www.image-net.org/challenges/LSVRC/2012/) for image classification.
|:---------|:---
|Purpose   |This folder contains examples that demonstrate how to use CNTK to define the VGG network (https://arxiv.org/abs/1409.1556) for image classification.
|Network   |VGG.
|Training  |Stochastic gradient descent with momentum.
|Comments  |See below.

## Running the example

### Getting the data
We use the ILSVRC2012 dataset to demonstrate how to train the VGG model, which was developed by the [Visual Geometry Group at the University of Oxford](http://www.robots.ox.ac.uk/~vgg/research/very_deep/) and took second place in the ILSVRC-2014 challenge. VGG has remained a very popular model thanks to its simple architecture and high accuracy.

The ILSVRC2012 dataset is not included in the CNTK distribution. You can obtain it through http://image-net.org.

## Details

We provide examples for both Python and BrainScript.

### [Python](./Python)

### [BrainScript](./BrainScript)
@@ -1,76 +0,0 @@
load=ndlMacros
run=DNN

ndlMacros = [
    ImageW = 224
    ImageH = 224
    ImageC = 3
    LabelDim = 1000

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
    labels = Input(LabelDim, tag = label)

    # Kernels width and height.
    kW = 3
    kH = 3
    # Kernel stride.
    hs = 1
    vs = 1

    # Pooling settings.
    poolW = 2
    poolH = 2
    poolhs = 2
    poolvs = 2

    # Initial parameter values.
    convWScale = 0.01 #7
    convBValue = 0
    fc1WScale = 0.01 #8
    fc1BValue = 0
    fc2WScale = 0.01 #3.2
    fc2BValue = 0
    fc3WScale = 0.01 #3.2
    fc3BValue = 0
]

DNN=[
    cMap1 = 64
    conv1 = ConvReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue)

    pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap2 = 128
    conv2 = ConvReLULayer(pool1, cMap2, 576, kW, kH, hs, vs, convWScale, convBValue)

    pool2 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap3 = 256
    conv3 = ConvReLULayer(pool2, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue)
    conv4 = ConvReLULayer(conv3, cMap3, 2304, kW, kH, hs, vs, convWScale, convBValue)

    pool3 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap5 = 512
    conv5 = ConvReLULayer(pool3, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv6 = ConvReLULayer(conv5, cMap5, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool4 = MaxPooling(conv6, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap6 = 512
    conv7 = ConvReLULayer(pool4, cMap6, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv8 = ConvReLULayer(conv7, cMap6, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool5 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    hiddenDim = 4096
    h1 = DnnReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
    h1_d = Dropout(h1)
    h2 = DnnReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
    h2_d = Dropout(h2)
    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
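
A reading aid, not part of the commit: in these NDL model files the third argument to ConvReLULayer appears to be the flattened per-output kernel input count, kW * kH * (input feature maps), and the first fully connected layer consumes the flattened pool5 tensor, which is consistent with every constant used:

$$27 = 3 \cdot 3 \cdot 3, \qquad 576 = 3 \cdot 3 \cdot 64, \qquad 1152 = 3 \cdot 3 \cdot 128, \qquad 2304 = 3 \cdot 3 \cdot 256, \qquad 4608 = 3 \cdot 3 \cdot 512,$$

$$25088 = 512 \cdot 7 \cdot 7, \qquad 7 = 224 / 2^5 \ \text{after five } 2 \times 2 \text{ poolings}.$$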
@@ -1,109 +0,0 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:AddTop5Eval:Test

stderr="$OutputDir$/VGG_A"
traceLevel=1
numMBsToShowResult=500

Train=[
    action="train"
    modelPath="$ModelDir$/VGG_A"
    traceLevel=1

    NDLNetworkBuilder=[
        networkDescription="$ConfigDir$/VGG_A.ndl"
    ]

    SGD=[
        epochSize=0
        minibatchSize=32
        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
        momentumPerMB=0.9
        maxEpochs=70
        gradUpdateType="None"
        L2RegWeight=0.0005
        dropoutRate=0*5:0.5

        numMBsToShowResult=10
    ]

    reader=[
        readerType="ImageReader"
        # Map file which maps images to labels using the following format:
        # <full path to image><tab><numerical label (0-based class id)>
        # Example:
        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
        file="$ConfigDir$/train_map.txt"
        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
        randomize="Auto"
        features=[
            # Below are the required parameters.
            width=224
            height=224
            channels=3
            # Below are the optional parameters.
            # Possible values: Center, Random. Default: Center
            cropType="RandomSide"
            # Horizontal random flip, will be enabled by default because cropType=RandomSide
            #hflip="true"
            # Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
            sideRatio=0.875
            # Crop scale ratio jitter type.
            # Possible values: None, UniRatio. Default: None
            jitterType="UniRatio"
            # Interpolation to use when scaling image to width x height size.
            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
            interpolations="Linear"
            # Stores mean values for each pixel in OpenCV matrix XML format.
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]

AddTop5Eval=[
    action="edit"
    CurModel="$ModelDir$/VGG_A"
    NewModel="$ModelDir$/VGG_A.Top5"
    editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
    action="test"
    modelPath="$ModelDir$/VGG_A.Top5"
    # Set minibatch size for testing.
    minibatchSize=32

    reader=[
        readerType="ImageReader"
        file="$ConfigDir$/val_map.txt"
        randomize="None"
        features=[
            width=224
            height=224
            channels=3
            cropType="Center"
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]
@@ -1,84 +0,0 @@
load=ndlMacros
run=DNN

ndlMacros = [
    ImageW = 224
    ImageH = 224
    ImageC = 3
    LabelDim = 1000

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
    labels = Input(LabelDim, tag = label)

    # Kernels width and height.
    kW = 3
    kH = 3
    # Kernel stride.
    hs = 1
    vs = 1

    # Pooling settings.
    poolW = 2
    poolH = 2
    poolhs = 2
    poolvs = 2

    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
    fc1WScale = 3.0
    fc1BValue = 1
    fc2WScale = 3.0
    fc2BValue = 1
    fc3WScale = 1.0
    fc3BValue = 1
]

DNN=[
    cMap1 = 64
    conv1 = ConvReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue)
    conv2 = ConvReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue)

    pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap3 = 128
    conv3 = ConvReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue)
    conv4 = ConvReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue)

    pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap5 = 256
    conv5 = ConvReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue)
    conv6 = ConvReLULayer(conv5, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv7 = ConvReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv8 = ConvReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)

    pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap9 = 512
    conv9 = ConvReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv10 = ConvReLULayer(conv9, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv11 = ConvReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv12 = ConvReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap13 = 512
    conv13 = ConvReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv14 = ConvReLULayer(conv13, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv15 = ConvReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv16 = ConvReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    hiddenDim = 4096
    h1 = DnnReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
    h1_d = Dropout(h1)
    h2 = DnnReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
    h2_d = Dropout(h2)
    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
@@ -1,85 +0,0 @@
load=ndlMacros
run=DNN

ndlMacros = [
    ImageW = 224
    ImageH = 224
    ImageC = 3
    LabelDim = 1000

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
    labels = Input(LabelDim, tag = label)

    # Kernels width and height.
    kW = 3
    kH = 3
    # Kernel stride.
    hs = 1
    vs = 1

    # Pooling settings.
    poolW = 2
    poolH = 2
    poolhs = 2
    poolvs = 2

    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
    scValue = 0.03
    fc1WScale = 3.0
    fc1BValue = 1
    fc2WScale = 3.0
    fc2BValue = 1
    fc3WScale = 1.0
    fc3BValue = 1
]

DNN=[
    cMap1 = 64
    conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv2 = ConvBNReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap3 = 128
    conv3 = ConvBNReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv4 = ConvBNReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap5 = 256
    conv5 = ConvBNReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv6 = ConvBNReLULayer(conv5, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv7 = ConvBNReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv8 = ConvBNReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap9 = 512
    conv9 = ConvBNReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv10 = ConvBNReLULayer(conv9, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv11 = ConvBNReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv12 = ConvBNReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap13 = 512
    conv13 = ConvBNReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv14 = ConvBNReLULayer(conv13, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv15 = ConvBNReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv16 = ConvBNReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    hiddenDim = 4096
    h1 = DnnBNReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
    h1_d = Dropout(h1)
    h2 = DnnBNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
    h2_d = Dropout(h2)
    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
@@ -1,118 +0,0 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:AddTop5Eval:Test

parallelTrain="false"

stderr="$OutputDir$/VGG_E_BN"
traceLevel=1

Train=[
    action="train"
    modelPath="$ModelDir$/VGG_E_BN"

    NDLNetworkBuilder=[
        networkDescription="$ConfigDir$/VGG_E_BN.ndl"
    ]

    SGD=[
        epochSize=0
        minibatchSize=16
        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
        momentumPerMB=0.9
        maxEpochs=70
        gradUpdateType="None"
        L2RegWeight=0.0005
        dropoutRate=0*5:0.5

        ParallelTrain=[
            parallelizationMethod="DataParallelSGD"
            distributedMBReading="true"
            parallelizationStartEpoch=1
            DataParallelSGD=[
                gradientBits=32
            ]
        ]

        numMBsToShowResult=10
    ]

    reader=[
        readerType="ImageReader"
        # Map file which maps images to labels using the following format:
        # <full path to image><tab><numerical label (0-based class id)>
        # Example:
        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
        file="$DataDir$/train_map.txt"
        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
        randomize="Auto"
        features=[
            # Below are the required parameters.
            width=224
            height=224
            channels=3
            # Below are the optional parameters.
            # Possible values: Center, Random. Default: Center
            cropType="RandomSide"
            # Horizontal random flip, will be enabled because cropType=RandomSide
            #hflip="true"
            # Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
            sideRatio=0.875
            # Crop scale ratio jitter type.
            # Possible values: None, UniRatio. Default: None
            jitterType="UniRatio"
            # Interpolation to use when scaling image to width x height size.
            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
            interpolations="Linear"
            # Stores mean values for each pixel in OpenCV matrix XML format.
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]

AddTop5Eval=[
    action="edit"
    CurModel="$ModelDir$/VGG_E_BN"
    NewModel="$ModelDir$/VGG_E_BN.Top5"
    editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
    action="test"
    modelPath="$ModelDir$/VGG_E_BN.Top5"
    # Set minibatch size for testing.
    minibatchSize=16

    reader=[
        readerType="ImageReader"
        file="$DataDir$/val_map.txt"
        randomize="None"
        features=[
            width=224
            height=224
            channels=3
            cropType="Center"
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]
@@ -1,118 +0,0 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:AddTop5Eval:Test

parallelTrain="false"

stderr="$OutputDir$/VGG_E"
traceLevel=1

Train=[
    action="train"
    modelPath="$ModelDir$/VGG_E"

    NDLNetworkBuilder=[
        networkDescription="$ConfigDir$/VGG_E.ndl"
    ]

    SGD=[
        epochSize=0
        minibatchSize=16
        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
        momentumPerMB=0.9
        maxEpochs=70
        gradUpdateType="None"
        L2RegWeight=0.0005
        dropoutRate=0*5:0.5

        ParallelTrain=[
            parallelizationMethod="DataParallelSGD"
            distributedMBReading="true"
            parallelizationStartEpoch=1
            DataParallelSGD=[
                gradientBits=32
            ]
        ]

        numMBsToShowResult=10
    ]

    reader=[
        readerType="ImageReader"
        # Map file which maps images to labels using the following format:
        # <full path to image><tab><numerical label (0-based class id)>
        # Example:
        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
        file="$DataDir$/train_map.txt"
        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
        randomize="Auto"
        features=[
            # Below are the required parameters.
            width=224
            height=224
            channels=3
            # Below are the optional parameters.
            # Possible values: Center, Random. Default: Center
            cropType="RandomSide"
            # Horizontal random flip, will be enabled because cropType=RandomSide
            #hflip="true"
            # Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
            sideRatio=0.875
            # Crop scale ratio jitter type.
            # Possible values: None, UniRatio. Default: None
            jitterType="UniRatio"
            # Interpolation to use when scaling image to width x height size.
            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
            interpolations="Linear"
            # Stores mean values for each pixel in OpenCV matrix XML format.
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]

AddTop5Eval=[
    action="edit"
    CurModel="$ModelDir$/VGG_E"
    NewModel="$ModelDir$/VGG_E.Top5"
    editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
    action="test"
    modelPath="$ModelDir$/VGG_E.Top5"
    # Set minibatch size for testing.
    minibatchSize=16

    reader=[
        readerType="ImageReader"
        file="$DataDir$/val_map.txt"
        randomize="None"
        features=[
            width=224
            height=224
            channels=3
            cropType="Center"
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]
@@ -3880,7 +3880,9 @@ namespace CNTK
    static MomentumSchedule DefaultVarianceMomentum = MomentumAsTimeConstantSchedule(2 * 3600 * 100);

    ///
    /// Create an instance of the CNTK built-in Adam learner (only the low-memory variant is supported at the moment).
    /// Create an instance of the Adam learner as described in the original paper.
    /// For historical reasons, the legacy implementation of AdamLearner is FSAdaGrad. To keep the interface backward compatible, we
    /// switch to the original Adam only when lowMemory = false, and keep the legacy logic when it is left at its default, i.e. true.
    ///
    CNTK_API LearnerPtr AdamLearner(const std::vector<Parameter>& parameters,
                                    const LearningRateSchedule& learningRateSchedule,
@@ -543,6 +543,48 @@ namespace CNTK
            s_targetAdagradAvDenom, momentum, varMomentum, UseUnitGainMomentum());
    }

    LearnerAdam::LearnerAdam(const vector<Parameter>& parameters,
                             const LearningRateSchedule& learningRateSchedule,
                             const MomentumSchedule& momentumSchedule,
                             bool unitGain,
                             const MomentumSchedule& varianceMomentumSchedule,
                             AdditionalLearningOptions additionalOptions)
        : LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
                             unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
        m_varianceMomentumSchedule(varianceMomentumSchedule)
    {
        for (const auto& parameter : parameters)
        {
            // A single buffer of shape {rows, 2*cols} holds both Adam moments for this parameter.
            const auto shape = GetMatrixShape(parameter);
            NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
            m_smoothedGradientValues.emplace(parameter, view);
            m_smoothedCounts.emplace(parameter, 0.0);
        }
    }

    /*virtual*/ void LearnerAdam::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                                         const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        DISPATCH_TO_TYPED_UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerAdam::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                             const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        GET_WRITABLE_MATRICES;

        const auto learningRate = LearningRate(trainingSampleCount);
        const auto momentum = MomentumValueForMB(trainingSampleCount);

        const auto varMomentum = VarianceMomentumValueForMB(trainingSampleCount);

        double& smoothedCount = m_smoothedCounts.at(parameter);

        smoothedGradientMatrix->AdamUpdate(*gradientMatrix, *parameterMatrix, smoothedCount, learningRate,
                                           momentum, varMomentum, UseUnitGainMomentum());
    }

    LearnerRMSProp::LearnerRMSProp(const vector<Parameter>& parameters,
                                   const LearningRateSchedule& learningRateSchedule,
                                   double gamma, double inc, double dec, double max, double min,
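
A minimal sketch, not part of the commit, of the packed-moment layout implied by the { shape[0], 2 * shape[1] } smoothed-gradient view allocated above; it matches the Data() / Data() + n pointer arithmetic used by the CPU and GPU Adam routines later in this diff. The struct and its names are hypothetical illustrations only:

#include <cstddef>
#include <vector>

// Both Adam moments for one parameter share a single buffer of twice the
// parameter's width: [ v_0 .. v_{n-1} | m_0 .. m_{n-1} ].
template <typename ElemType>
struct PackedAdamState
{
    std::vector<ElemType> buffer; // size 2 * n
    std::size_t n;                // number of parameter elements

    ElemType* secondMoment() { return buffer.data(); }     // corresponds to Data()
    ElemType* firstMoment()  { return buffer.data() + n; } // corresponds to Data() + n
};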
@@ -623,16 +665,21 @@ namespace CNTK
    LearnerPtr AdamLearner(const vector<Parameter>& parameters,
                           const LearningRateSchedule& learningRateSchedule,
                           const MomentumSchedule& momentumSchedule,
                           bool unitGain,
                           bool unitGain, /*=true*/
                           const MomentumSchedule& varianceMomentumSchedule, /*= MomentumAsTimeConstantSchedulePerSample(2 * 3600 * 100)*/
                           bool lowMemory, /*= true*/
                           AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        // TODO: For historical reasons, the legacy AdamLearner uses the FSAdaGrad implementation instead of the original paper's.
        // To keep the interface backward compatible, the new Adam is enabled only when lowMemory is false.
        if (!lowMemory)
        {
            return MakeSharedObject<LearnerAdam>(parameters, learningRateSchedule, momentumSchedule, unitGain, varianceMomentumSchedule, additionalOptions);
        }
        else
        {
            return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRateSchedule, momentumSchedule, unitGain, varianceMomentumSchedule, additionalOptions);
        }
    }

    LearnerPtr AdaGradLearner(const vector<Parameter>& parameters,
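
A usage sketch, not part of the commit: constructing a learner that takes the new LearnerAdam path through the public factory shown above. The wrapper function MakePaperAdam is hypothetical; every CNTK name used is taken from this diff:

#include "CNTKLibrary.h"

using namespace CNTK;

// Hypothetical helper: build an Adam learner that follows the paper's algorithm.
LearnerPtr MakePaperAdam(const std::vector<Parameter>& parameters,
                         const LearningRateSchedule& learningRateSchedule,
                         const MomentumSchedule& momentumSchedule)
{
    // Variance momentum (beta2) default, mirroring the declaration in CNTKLibrary.h.
    MomentumSchedule varianceMomentum = MomentumAsTimeConstantSchedule(2 * 3600 * 100);

    // lowMemory = false dispatches to LearnerAdam (the paper's algorithm);
    // the default, true, keeps the legacy FSAdaGrad behavior.
    return AdamLearner(parameters, learningRateSchedule, momentumSchedule,
                       /*unitGain=*/true, varianceMomentum,
                       /*lowMemory=*/false, AdditionalLearningOptions());
}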
@@ -234,6 +234,36 @@ namespace CNTK
        MomentumSchedule m_varianceMomentumSchedule;
    };

    class LearnerAdam : public LearnerMomentumSGD
    {
    public:

        LearnerAdam(const std::vector<Parameter>& parameters,
                    const LearningRateSchedule& learningRateSchedule,
                    const MomentumSchedule& momentumSchedule,
                    bool unitGain,
                    const MomentumSchedule& varianceMomentumSchedule,
                    AdditionalLearningOptions additionalOptions);

    protected:

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

    private:

        // Returns the current per-minibatch variance momentum value.
        double VarianceMomentumValueForMB(size_t minibatchSize) const
        {
            return MomentumValueForMB(m_varianceMomentumSchedule, minibatchSize);
        }

        mutable std::unordered_map<Parameter, double> m_smoothedCounts;
        MomentumSchedule m_varianceMomentumSchedule;
    };

    class LearnerRMSProp : public LearnerBase
    {
    public:
@@ -137,7 +137,6 @@ namespace CNTK
            (op == PrimitiveOpType::CrossEntropyWithSoftmax) ||
            (op == PrimitiveOpType::ClassificationError) ||
            (op == PrimitiveOpType::Logistic) ||
            (op == PrimitiveOpType::CosDistance) ||
            (op == PrimitiveOpType::LambdaRank) ||
            (op == PrimitiveOpType::NDCG))
        {
@@ -432,7 +432,20 @@ template <class ElemType>
    InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
    let& index  = InputRef(INDEXDATA) .Value(); // column indices to copy from
    let& source = InputRef(SOURCEDATA).Value(); // source data to copy

#ifdef _MSC_VER
    auto& outputValuePtrRef = ValuePtrRef();
#else
    auto& outputValuePtrRef = this->template ValuePtrRef();
#endif
    // If the source is sparse but the pre-allocated output is dense, re-allocate the output in the source's sparse format.
    if ((source.GetMatrixType() == SPARSE) && (outputValuePtrRef->GetMatrixType() != SPARSE))
        outputValuePtrRef = std::make_shared<Matrix<ElemType>>(outputValuePtrRef->GetNumRows(),
                                                               outputValuePtrRef->GetNumCols(),
                                                               outputValuePtrRef->GetPreferredDeviceId(),
                                                               source.GetMatrixType(),
                                                               source.GetFormat());

    auto& output = Value(); // output goes here
    output.DoGatherColumnsOf(/*beta=*/0, index, source, /*alpha=*/1);
}
@@ -493,6 +506,19 @@ template <class ElemType>
    InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
    let& index  = InputRef(INDEXDATA) .Value(); // column indices to copy from
    let& source = InputRef(SOURCEDATA).Value(); // source data to copy

#ifdef _MSC_VER
    auto& outputValuePtrRef = ValuePtrRef();
#else
    auto& outputValuePtrRef = this->template ValuePtrRef();
#endif
    // If the source is sparse but the pre-allocated output is dense, re-allocate the output in the source's sparse format.
    if ((source.GetMatrixType() == SPARSE) && (outputValuePtrRef->GetMatrixType() != SPARSE))
        outputValuePtrRef = std::make_shared<Matrix<ElemType>>(outputValuePtrRef->GetNumRows(),
                                                               outputValuePtrRef->GetNumCols(),
                                                               outputValuePtrRef->GetPreferredDeviceId(),
                                                               source.GetMatrixType(),
                                                               source.GetFormat());

    auto& output = Value(); // output goes here
    output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1);
}
@@ -69,6 +69,8 @@ public:
    if (flags & CopyNodeFlags::copyNodeValue)
    {
        auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
        node->m_beginDimParameter = m_beginDimParameter;
        node->m_endDimParameter = m_endDimParameter;
        node->m_replacementSampleLayout = m_replacementSampleLayout;
    }
}
@@ -1246,6 +1246,41 @@ void CPUMatrix<ElemType>::FSAdagrad(CPUMatrix<ElemType>& gradients,
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::Adam(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
    ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
    size_t numColsNeeded = 2 * gradients.GetNumCols();
    auto unitGainFactor = ElemType(unitGainMomentum ? (1.0 - momentum) : 1.0);

    if (IsEmpty() || (GetNumCols() < numColsNeeded))
    {
        RequireSize(gradients.GetNumRows(), numColsNeeded);
        SetValue(0.0);
    }

    assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));

    size_t n = gradients.GetNumElements();
    ElemType* grad = gradients.Data();
    ElemType* smoothAda = Data();     // second raw moment (v), first half of the state buffer
    ElemType* smoothMom = Data() + n; // first raw moment (m), second half of the state buffer
    ElemType* val = functionValues.Data();
#pragma omp parallel for
    // TODO: Unroll 4 times for better performance, leveraging vectorization
    for (long i = 0; i < (long) n; i++)
    {
        ElemType g = grad[i];
        ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g;
        smoothAda[i] = adaSqr;
        ElemType ada = sqrt(adaSqr);
        ElemType w = adaMul * (ElemType)(1.0 / (ada + 1e-8));
        g = momentum * smoothMom[i] + unitGainFactor * g;
        smoothMom[i] = g;
        val[i] -= g * w * learnRatePerSample;
    }
}

template <class ElemType>
ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
    ElemType RMS_GAMMA,
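
A reading aid, not part of the commit: with $\beta_1$ = momentum, $\beta_2$ = adaWeight, $c$ = adaMul (the bias-correction factor supplied by the caller), $u = 1-\beta_1$ under unit-gain momentum and $u = 1$ otherwise, and $\eta$ = learnRatePerSample, each iteration of the loop above computes

$$v_i \leftarrow \beta_2 v_i + (1-\beta_2)\, g_i^2, \qquad m_i \leftarrow \beta_1 m_i + u\, g_i, \qquad \theta_i \leftarrow \theta_i - \eta\, c\, \frac{m_i}{\sqrt{v_i} + 10^{-8}}.$$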
@@ -95,7 +95,10 @@ public:

    void FSAdagrad(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
                   ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    void Adam(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
              ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    ElemType RmsProp(CPUMatrix<ElemType>& gradients,
                     ElemType RMS_GAMMA,
                     ElemType RMS_WGT_INC,
@@ -1413,6 +1413,31 @@ void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients,
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
void GPUMatrix<ElemType>::Adam(GPUMatrix<ElemType>& gradients,
                               GPUMatrix<ElemType>& functionValues,
                               ElemType learnRatePerSample,
                               ElemType momentum,
                               ElemType adaWeight,
                               ElemType adaMul,
                               bool unitGainMomentum)
{
    size_t numColsNeeded = 2 * gradients.GetNumCols();

    if (IsEmpty() || (GetNumCols() < numColsNeeded))
    {
        RequireSize(gradients.GetNumRows(), numColsNeeded);
        SetValue(0.0);
    }

    assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));

    size_t n = gradients.GetNumElements();
    int blocksPerGrid = (int) ((n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
    _adam<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(n, gradients.Data(), Data(), Data() + n, functionValues.Data(),
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
ElemType GPUMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& gradients,
    ElemType RMS_GAMMA,
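
A small reading aid, not part of the commit: the blocksPerGrid expression is the standard integer ceiling division, which guarantees a thread for every element even when n is not a multiple of the block size:

$$\text{blocksPerGrid} = \left\lceil \frac{n}{T} \right\rceil = \left\lfloor \frac{n + T - 1}{T} \right\rfloor, \qquad T = \text{GridDim::maxThreadsPerBlock}.$$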
@@ -224,10 +224,13 @@ public:
    }

    ElemType Adagrad(GPUMatrix<ElemType>& gradients, const bool needAveMultiplier);

    void FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
                   ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    void Adam(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
              ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    ElemType RmsProp(GPUMatrix<ElemType>& gradients,
                     ElemType RMS_GAMMA,
                     ElemType RMS_WGT_INC,
@@ -5132,6 +5132,66 @@ __global__ void _maskColumnsValue(ElemType* a, const char* columnsMask, CUDA_LON
        a[IDX2C(rowIdx, colIdx, numRows)] = val;
    }
}

template <class ElemType>
__global__ void _adam(CUDA_LONG size, ElemType* grad, ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
    const ElemType unitGainFactor = unitGainMomentum ? (1.0 - mom) : 1.0;
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = grad[idx];
        ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
        smoothAda[idx] = adaSqr;
        ElemType w;
        if (sizeof(ElemType) == sizeof(double))
        {
            w = adaMul * rsqrt(adaSqr + 1e-8);
        }
        else
        {
            w = adaMul * rsqrtf(adaSqr + 1e-8);
        }

        g = mom * smoothMom[idx] + unitGainFactor * g;
        smoothMom[idx] = g;
        g = lr * g * w;
        val[idx] -= g;
    }
}

template <class ElemType>
__global__ void _adam4BlockSparseCol(CUDA_LONG size,
    ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
    const ElemType unitGainFactor = unitGainMomentum ? (1.0 - mom) : 1.0;
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, idx);
        ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
        smoothAda[idx] = adaSqr;
        ElemType w;
        if (sizeof(ElemType) == sizeof(double))
        {
            w = adaMul * rsqrt(adaSqr + 1e-8);
        }
        else
        {
            w = adaMul * rsqrtf(adaSqr + 1e-8);
        }

        g = mom * smoothMom[idx] + unitGainFactor * g;
        smoothMom[idx] = g;
        g = lr * g * w;
        val[idx] -= g;
    }
}
}
}
}
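
A reading aid, not part of the commit: both kernels above use the grid-stride loop idiom, so a thread with global index $t$ (out of $S = \text{blockDim.x} \cdot \text{gridDim.x}$ threads) processes the index set

$$\{\, t,\; t+S,\; t+2S,\; \dots \,\} \cap [0, \text{size}),$$

which keeps the kernels correct for any launch configuration, including grids with fewer threads than elements.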
@@ -1148,6 +1148,7 @@ void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPU
    {
        RuntimeError("Only support c += alpha * a operation");
    }

    int blocksPerGrid = (int) ceil(1.0 * cRows / GridDim::maxThreadsPerBlock);
    SyncGuard syncGuard;
    for (int rowInB = 0; rowInB < l; rowInB++)
@@ -1546,6 +1547,39 @@ void GPUSparseMatrix<ElemType>::FSAdagrad(
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Adam(
    GPUMatrix<ElemType>& c,
    GPUMatrix<ElemType>& functionValues,
    ElemType learnRatePerSample,
    ElemType momentum,
    ElemType adaWeight,
    ElemType adaMul,
    bool unitGainMomentum)
{
    if (GetFormat() != MatrixFormat::matrixFormatSparseBlockCol)
    {
        NOT_IMPLEMENTED;
    }

    size_t numColsNeeded = 2 * GetNumCols();

    if (c.IsEmpty() || (c.GetNumCols() < numColsNeeded))
    {
        c.RequireSize(GetNumRows(), numColsNeeded);
        c.SetValue(0.0);
    }

    assert((c.GetNumRows() == GetNumRows()) && (c.GetNumCols() == numColsNeeded));

    size_t n = GetNumElements();
    int blocksPerGrid = (int) ((n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
    _adam4BlockSparseCol<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(
        n, Data(), ColOrRow2BlockId(), GetNumRows(),
        c.Data(), c.Data() + n, functionValues.Data(),
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& c,
    ElemType RMS_GAMMA,
@@ -412,6 +412,7 @@ public:
    ElemType Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier);
    void FSAdagrad(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);
    ElemType RmsProp(GPUMatrix<ElemType>& c, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
    void Adam(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
    static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);
@@ -1089,6 +1089,9 @@ Matrix<ElemType>& Matrix<ElemType>::DoGatherColumnsOf(ElemType beta, const Matri
{
    DecideAndMoveToRightDevice(*this, idx, a); // TODO: only move target if beta != 0

    if (a.GetMatrixType() != this->GetMatrixType())
        RuntimeError("Matrix::DoGatherColumnsOf: The source and target matrices must have same storage type (SPARSE/DENSE).");

    DISPATCH_MATRIX_ON_FLAG(&a, this,
        { m_CPUMatrix->DoGatherColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUMatrix, alpha); },
        { m_GPUMatrix->DoGatherColumnsOf(beta, *idx.m_GPUMatrix, *a.m_GPUMatrix, alpha); },
@@ -1101,8 +1104,7 @@ Matrix<ElemType>& Matrix<ElemType>::DoGatherColumnsOf(ElemType beta, const Matri
            CPUSparseMatrix<ElemType> tempA(a.GetFormat(), a.GetNumRows(), a.GetNumCols(), a.m_GPUSparseMatrix->GetNumNZElements());
            a.m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempA);

            CPUSparseMatrix<ElemType> tempThis(m_GPUSparseMatrix->GetFormat(), m_GPUSparseMatrix->GetNumRows(), m_GPUSparseMatrix->GetNumCols(), m_GPUSparseMatrix->GetNumNZElements());
            m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempThis);

            tempThis.DoGatherColumnsOf(beta, *tempIdx.m_CPUMatrix, tempA, alpha);
@@ -1121,11 +1123,27 @@ Matrix<ElemType>& Matrix<ElemType>::DoScatterColumnsOf(ElemType beta, const Matr
{
    DecideAndMoveToRightDevice(*this, idx, a); // TODO: only move target if beta != 0

    if (a.GetMatrixType() != this->GetMatrixType())
        RuntimeError("Matrix::DoScatterColumnsOf: The source and target matrices must have same storage type (SPARSE/DENSE).");

    DISPATCH_MATRIX_ON_FLAG(&a, this,
        { m_CPUMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUMatrix, alpha); },
        { m_GPUMatrix->DoScatterColumnsOf(beta, *idx.m_GPUMatrix, *a.m_GPUMatrix, alpha); },
        { m_CPUSparseMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUSparseMatrix, alpha); },
        {
            // TODO: replace by a more performant version directly on the GPU that does not require the round-trip over the CPU.

            Matrix<ElemType> tempIdx(CPUDEVICE); tempIdx.AssignValuesOf(idx);

            CPUSparseMatrix<ElemType> tempA(a.GetFormat(), a.GetNumRows(), a.GetNumCols(), a.m_GPUSparseMatrix->GetNumNZElements());
            a.m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempA);

            CPUSparseMatrix<ElemType> tempThis(m_GPUSparseMatrix->GetFormat(), m_GPUSparseMatrix->GetNumRows(), m_GPUSparseMatrix->GetNumCols(), m_GPUSparseMatrix->GetNumNZElements());
            m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempThis);

            tempThis.DoScatterColumnsOf(beta, *tempIdx.m_CPUMatrix, tempA, alpha);
            m_GPUSparseMatrix->SetValue(tempThis);
        });

    return *this;
}
@@ -1672,6 +1690,40 @@ void Matrix<ElemType>::FSAdagradUpdate(size_t mbSize,
    // Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}

///
// Implements the original Adam algorithm according to the paper.
// Ref: ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION, https://arxiv.org/pdf/1412.6980.pdf
///
template <class ElemType>
void Matrix<ElemType>::AdamUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, double& smoothedCount,
    const double learnRatePerSample, const double meanMomentum, const double varMomentum, bool unitGainMomentum)
{
    smoothedCount++;
    // Bias correction
    let biasCorrection = (ElemType)(sqrt(1 - pow(varMomentum, smoothedCount)) / (1 - pow(meanMomentum, smoothedCount)));

    DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
        {
            m_CPUMatrix->Adam(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix,
                (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum,
                biasCorrection, unitGainMomentum);
            SetDataLocation(CPU);
        },
        {
            m_GPUMatrix->Adam(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix,
                (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum,
                biasCorrection, unitGainMomentum);
            SetDataLocation(GPU);
        },
        { NOT_IMPLEMENTED; },
        {
            gradients.m_GPUSparseMatrix->Adam(*m_GPUMatrix, *functionValues.m_GPUMatrix,
                (ElemType)learnRatePerSample, (ElemType)meanMomentum,
                (ElemType)varMomentum, biasCorrection, unitGainMomentum);
            SetDataLocation(GPU);
        });

    // Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}

template <class ElemType>
ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
    ElemType RMS_GAMMA,
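
A reading aid, not part of the commit: with $\beta_1$ = meanMomentum, $\beta_2$ = varMomentum, and $t$ = smoothedCount, the single factor computed above folds the paper's two bias corrections, $\hat m_t = m_t/(1-\beta_1^t)$ and $\hat v_t = v_t/(1-\beta_2^t)$, into one multiplier that the element-wise kernels receive as adaMul:

$$c_t = \frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}, \qquad \eta\,\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon} \;\approx\; \eta\, c_t\, \frac{m_t}{\sqrt{v_t}+\epsilon},$$

where the approximation only shifts the placement of the small constant $\epsilon$.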
@@ -215,6 +215,10 @@ public:
    Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, double& smoothedCount,
    const double learnRatePerSample, const double targetAdagradAvDenom,
    const double meanMomentum, const double varMomentum, bool unitGainMomentum = true);

    void AdamUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, double& smoothedCount,
        const double learnRatePerSample, const double meanMomentum, const double varMomentum, bool unitGainMomentum = true);

    ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);

    void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); // by default we only reallocate if we need to grow
@@ -261,6 +261,11 @@ void GPUSparseMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>&, GPUMatrix<ElemTy
{
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Adam(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType, ElemType, const bool)
{
@@ -1077,6 +1082,13 @@ void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<El
{
}

template <class ElemType>
void GPUMatrix<ElemType>::Adam(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
    ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
}

template <class ElemType>
ElemType GPUMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier)
{
@@ -7,50 +7,28 @@
import numpy as np
import os
import sys
import signal
import subprocess
import re
import pytest
from cntk.ops.tests.ops_test_utils import cntk_device
from cntk.cntk_py import DeviceKind_GPU
from cntk.device import set_default_device
from cntk.io import ReaderConfig, ImageDeserializer
from cntk import distributed

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "AlexNet", "Python"))
example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "AlexNet", "Python")
sys.path.append(example_dir)
from prepare_test_data import prepare_ImageNet_data
from AlexNet_ImageNet_Distributed import alexnet_train_and_eval
from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test
script_under_test = os.path.join(example_dir, "AlexNet_ImageNet_Distributed.py")

#TOLERANCE_ABSOLUTE = 2E-1

def test_alexnet_error(device_id):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    base_path = prepare_ImageNet_data()
    # change dir to locate data.zip correctly
    os.chdir(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    # for test purposes we train and test on the same data
    train_data = os.path.join(base_path, 'val1024_map.txt')
    test_data = os.path.join(base_path, 'val1024_map.txt')

    test_error = alexnet_train_and_eval(train_data, test_data,
                                        num_quantization_bits=32,
                                        minibatch_size=16,
                                        epoch_size=64,
                                        max_epochs=2)
    distributed.Communicator.finalize()
    # expected_test_error = 0.0

    # We are removing the tolerance on the error because running with a small epoch size has huge variance in accuracy. We will add
    # the tolerance back once the convolution operator is deterministic.

    # assert np.allclose(test_error, expected_test_error,
    #                    atol=TOLERANCE_ABSOLUTE)

def test_alexnet_imagenet_distributed(device_id):
    params = [ "-n", "2",
               "-m", "8",
               "-e", "16",
               "-datadir", prepare_ImageNet_data(),
               "-q", "32",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.99, True)
@@ -16,28 +16,15 @@ from cntk.cntk_py import DeviceKind_GPU
from cntk.device import set_default_device

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python")

sys.path.append(example_dir)
from prepare_test_data import prepare_CIFAR10_data
script_under_test = os.path.join(example_dir, "ConvNet_CIFAR10_DataAug_Distributed.py")

TOLERANCE_ABSOLUTE = 2E-1
TIMEOUT_SECONDS = 300

def data_set_directory():
    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                 *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        # and CIFAR-10_mean.xml in the base_path.
    except KeyError:
        base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))
    return base_path

def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=True, per_minibatch_tolerance=TOLERANCE_ABSOLUTE, error_tolerance=TOLERANCE_ABSOLUTE):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')

@@ -56,6 +43,7 @@ def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=T
    results = re.findall("Cross Validation \[.+?\]: Minibatch\[.+?\]: errs = (.+?)%", str_out)

    assert len(results) == 2
    print(results)

    if match_exactly:
        assert results[0] == results[1]

@@ -65,23 +53,32 @@ def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=T
    assert np.allclose(float(results[0])/100, expected_test_error, atol=error_tolerance)

def test_cifar_convnet_distributed(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
    params = [ "-n", "2",
               "-m", "64",
               "-e", "3200",
               "-datadir", prepare_CIFAR10_data(),
               "-q", "32",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.617)
    mpiexec_test(device_id, script_under_test, params, 0.75, True)

def test_cifar_convnet_distributed_1bitsgd(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
    params = [ "-n", "2",
               "-m", "64",
               "-e", "3200",
               "-datadir", prepare_CIFAR10_data(),
               "-q", "1",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.617)
    mpiexec_test(device_id, script_under_test, params, 0.75, True)


def test_cifar_convnet_distributed_block_momentum(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-b", "3200",
    params = [ "-n", "2",
               "-m", "64",
               "-e", "3200",
               "-datadir", prepare_CIFAR10_data(),
               "-b", "1600",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.6457, False, 10)
    mpiexec_test(device_id, script_under_test, params, 0.78, False, 10)
@@ -30,8 +30,7 @@ def test_cifar_convnet_error(device_id):
    # change dir to locate data.zip correctly
    os.chdir(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    from _cntk_py import set_fixed_random_seed, force_deterministic_algorithms
    set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now
@@ -15,13 +15,14 @@ example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image"
sys.path.append(example_dir)
sys.path.append(abs_path)

from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test, data_set_directory
from prepare_test_data import prepare_CIFAR10_data
from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test

script_under_test = os.path.join(example_dir, "TrainResNet_CIFAR10_Distributed.py")

def test_cifar_resnet_distributed(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-datadir", prepare_CIFAR10_data(),
               "-q", "32",
               "-es", "512",
               "-device", "0" ]

@@ -29,7 +30,7 @@ def test_cifar_resnet_distributed(device_id):

def test_cifar_resnet_distributed_1bitsgd(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-datadir", prepare_CIFAR10_data(),
               "-q", "1",
               "-es", "512",
               "-device", "0" ]

@@ -38,7 +39,7 @@ def test_cifar_resnet_distributed_1bitsgd(device_id):

def test_cifar_resnet_distributed_block_momentum(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-datadir", prepare_CIFAR10_data(),
               "-b", "3200",
               "-es", "512",
               "-device", "0" ]
@@ -0,0 +1,34 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import numpy as np
import os
import sys
import signal
import subprocess
import re
import pytest
from cntk.ops.tests.ops_test_utils import cntk_device
from cntk.cntk_py import DeviceKind_GPU
from cntk.device import set_default_device

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "VGG", "Python")
sys.path.append(example_dir)
from prepare_test_data import prepare_ImageNet_data
from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test
script_under_test = os.path.join(example_dir, "VGG16_ImageNet_Distributed.py")

# def test_vgg16_imagenet_distributed(device_id):
#     params = [ "-n", "2",
#                "-m", "1",
#                "-e", "2",
#                "-datadir", prepare_ImageNet_data(),
#                "-q", "32",
#                "-r",
#                "-device", "0" ]
#     mpiexec_test(device_id, script_under_test, params, 0.99, True)
@@ -29,16 +29,19 @@ def prepare_CIFAR10_data():

def prepare_ImageNet_data():
    base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            *"../../../../Examples/Image/DataSets/ImageNet".split("/"))
                            *"../../../../Examples/Image/DataSets/ImageNet/test_data".split("/"))
    base_path = os.path.normpath(base_path)
    if not os.path.isdir(base_path):
        os.mkdir(base_path)

    # If the map files don't exist locally, copy them to the local location
    if not os.path.isfile(os.path.join(base_path, 'val1024_map.txt')):
    if not (os.path.isfile(os.path.join(base_path, 'train_map.txt')) and os.path.isfile(os.path.join(base_path, 'val_map.txt'))):
        # copy from backup location
        base_path_bak = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                     *"Image/ImageNet/2012/v0".split("/"))
        base_path_bak = os.path.normpath(base_path_bak)

        copyfile(os.path.join(base_path_bak, 'val1024_map.txt'), os.path.join(base_path, 'val1024_map.txt'))
        copyfile(os.path.join(base_path_bak, 'val1024_map.txt'), os.path.join(base_path, 'train_map.txt'))
        copyfile(os.path.join(base_path_bak, 'val1024_map.txt'), os.path.join(base_path, 'val_map.txt'))
        copyfile(os.path.join(base_path_bak, 'val1024.zip'), os.path.join(base_path, 'val1024.zip'))
    return base_path

@@ -1,43 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import numpy as np
import os
import sys
import platform
from cntk.io import ReaderConfig, ImageDeserializer, FULL_DATA_SWEEP
from cntk import distributed
from cntk.device import set_default_device, gpu

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python"))
from prepare_test_data import prepare_CIFAR10_data
from ConvNet_CIFAR10_DataAug_Distributed import convnet_cifar10_dataaug

def run_cifar_convnet_distributed():
    base_path = prepare_CIFAR10_data()
    # change dir to locate data.zip correctly
    os.chdir(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    train_data = os.path.join(base_path, 'train_map.txt')
    mean_data = os.path.join(base_path, 'CIFAR-10_mean.xml')
    test_data = os.path.join(base_path, 'test_map.txt')

    num_quantization_bits = 32
    return convnet_cifar10_dataaug(train_data, test_data, mean_data, num_quantization_bits, epoch_size=512, max_epochs=2)

if __name__=='__main__':
    assert distributed.Communicator.rank() < distributed.Communicator.num_workers()
    set_default_device(gpu(0)) # force using GPU-0 in test for speed
    run_cifar_convnet_distributed()
    distributed.Communicator.finalize()
(File diff not shown because it is too large)

@@ -898,6 +898,36 @@ BOOST_FIXTURE_TEST_CASE(CPUMatrixSeedingDouble, RandomSeedFixture)
    BOOST_CHECK(m1.IsEqualTo(m2));
}

BOOST_FIXTURE_TEST_CASE(CPUMatrixAdam, RandomSeedFixture)
{
    CPUMatrix<double> adamMatrix;
    CPUMatrix<double> gradients(2, 1);
    CPUMatrix<double> parameters(2, 1);
    CPUMatrix<double> expectedParameters(2, 1);
    CPUMatrix<double> expectedStates(2, 2);
    double gradientValues[] = { 0.1, -0.1 };
    double paramValues[] = { 0.1, 0.1 };
    double expectedValues[] = { -0.05811338, 0.25811338 };
    double expectedStateValues[] = { 1e-5, 0.01, 1e-5, -0.01 };
    gradients.SetValue(2, 1, gradientValues, matrixFormatRowMajor);
    parameters.SetValue(2, 1, paramValues, matrixFormatRowMajor);
    expectedParameters.SetValue(2, 1, expectedValues, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, expectedStateValues, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);

    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));

    double expectedValues2[] = { -0.27059249, 0.47059249 };
    double expectedStateValues2[] = { 2e-05, 0.019, 2e-05, -0.019 };
    expectedParameters.SetValue(2, 1, expectedValues2, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, expectedStateValues2, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);

    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));
}

BOOST_AUTO_TEST_SUITE_END()
}
} } }

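The expected values above follow from a unit-gain Adam step: the smoothed gradient is m = momentum*m + (1-momentum)*g, the smoothed square is v = adaWeight*v + (1-adaWeight)*g^2, and the parameter moves by lr*adaMul*m/(sqrt(v)+eps). A minimal NumPy sketch, assuming this form and a denominator epsilon of 1e-8 (both assumptions, chosen to match the CPU numbers):

import numpy as np

def adam_step(p, g, m, v, lr=0.1, momentum=0.9, ada_weight=0.999, ada_mul=0.5, eps=1e-8):
    # mirrors Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true) above (assumed semantics)
    m = momentum * m + (1 - momentum) * g         # unit-gain smoothed gradient
    v = ada_weight * v + (1 - ada_weight) * g**2  # smoothed squared gradient
    p = p - lr * ada_mul * m / (np.sqrt(v) + eps)
    return p, m, v

p, g = np.array([0.1, 0.1]), np.array([0.1, -0.1])
m, v = np.zeros(2), np.zeros(2)
p, m, v = adam_step(p, g, m, v)
print(p)  # ~[-0.05811338, 0.25811338], matching expectedValues
p, m, v = adam_step(p, g, m, v)
print(p)  # ~[-0.27059249, 0.47059249], matching expectedValues2
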
@@ -537,6 +537,34 @@ BOOST_FIXTURE_TEST_CASE(GPUMatrixCurandSeedingDouble, RandomSeedFixture)
    BOOST_CHECK(m1.IsEqualTo(m2));
}

BOOST_FIXTURE_TEST_CASE(GPUMatrixAdam, RandomSeedFixture)
{
    GPUMatrix<double> adamMatrix(c_deviceIdZero);
    GPUMatrix<double> gradients(2, 1, c_deviceIdZero);
    GPUMatrix<double> parameters(2, 1, c_deviceIdZero);
    GPUMatrix<double> expectedParameters(2, 1, c_deviceIdZero);
    GPUMatrix<double> expectedStates(2, 2, c_deviceIdZero);
    double gradientValues[] = { 0.1, -0.1 };
    double paramValues[] = { 0.1, 0.1 };
    double expectedValues[] = { -0.05803489, 0.25803488 };
    double expectedStateValues[] = { 1e-5, 0.01, 1e-5, -0.01 };
    gradients.SetValue(2, 1, c_deviceIdZero, gradientValues, matrixFormatRowMajor);
    parameters.SetValue(2, 1, c_deviceIdZero, paramValues, matrixFormatRowMajor);
    expectedParameters.SetValue(2, 1, c_deviceIdZero, expectedValues, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, c_deviceIdZero, expectedStateValues, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);
    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));

    double expectedValues2[] = { -0.27046135, 0.47046134 };
    double expectedStateValues2[] = { 2e-05, 0.019, 2e-05, -0.019 };
    expectedParameters.SetValue(2, 1, c_deviceIdZero, expectedValues2, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, c_deviceIdZero, expectedStateValues2, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);
    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));
}

#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUMatrixLargeInequality, RandomSeedFixture)
{

@@ -95,6 +95,15 @@ void TestFSAdaGradLearner(size_t numParameters, size_t numMinibatches, bool unit
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestAdamLearner(size_t numParameters, size_t numMinibatches, bool unitGainMomentum, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = AdamLearner(parameters, LearningRatePerSampleSchedule({ 0.5 }), MomentumAsTimeConstantSchedule({ 10.0, 100.0, 1000.0 }), unitGainMomentum, MomentumPerSampleSchedule(0.99), false);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestRMSPropLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
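For reference on the schedules used here: a momentum time constant tc corresponds to a per-sample momentum of exp(-1/tc) in CNTK's convention, so the schedule { 10.0, 100.0, 1000.0 } above ramps the effective momentum toward 1. A quick illustrative check:

import math
for tc in (10.0, 100.0, 1000.0):
    print(tc, math.exp(-1.0 / tc))  # per-sample momentum: ~0.9048, ~0.9900, ~0.9990
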
@@ -335,6 +344,8 @@ void LearnerTests()
    TestMomentumSGDLearner<float>(numParameters, numMinibatches, unitGain, device);
    TestNesterovLearner<float>(numParameters, numMinibatches, unitGain, device);
    TestFSAdaGradLearner<double>(numParameters, numMinibatches, unitGain, device);
    TestAdamLearner<float>(numParameters, numMinibatches, unitGain, device);
    TestAdamLearner<double>(numParameters, numMinibatches, unitGain, device);
}
}
}
}

@@ -76,7 +76,7 @@
    "name": "Fast R-CNN",
    "url": "https://github.com/Microsoft/CNTK/wiki/Object-Detection-using-Fast-R-CNN",
    "description": "Train object detection from images by adapting pre-trained classification models on arbitrarily sized regions of interest using ROI pooling.",
    "language": ["BrainScript"],
    "language": ["Python", "BrainScript"],
    "type": ["Tutorial", "Recipe"]
},
{

@@ -204,7 +204,7 @@
    "name": "VGG",
    "url": "https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Classification/VGG",
    "description": "Deep CNN from University of Oxford. This was the winning model for the ILSVRC2014 localization task.",
    "language": ["BrainScript"],
    "language": ["Python", "BrainScript"],
    "type": ["Recipe"]
},
{

(File diff hidden because one or more lines are too long)

@@ -292,3 +292,10 @@ class Value(cntk_py.Value):
    '''
    return self.shape[0]

def user_function(user_func):
    '''
    Wraps the passed Function to create a composite representing the
    composite Function graph rooted at the passed root Function.
    '''
    from . import as_composite
    return as_composite(user_func)

@@ -42,6 +42,11 @@ def depth_first_search(node, visitor):
        pass

    if visitor(node):
        if node.is_parameter:
            node = node.as_parameter()
        elif node.is_constant:
            node = node.as_constant()

        accum.append(node)

    visited.add(node)

@@ -349,7 +349,10 @@ def Recurrence(over, go_backwards=False, initial_state=initial_state_default_or_
    f_x_h_c = over(x, prev_state) # apply the recurrent over
    # this returns a Function (x, (h_prev, c_prev)) -> (h, c)
    h_c = f_x_h_c.outputs
    replacements = { value_forward: value for (value_forward, value) in zip(list(_as_tuple(state_forward)), h_c) }
    if type(state_forward) is tuple and len(state_forward) > 1:
        replacements = { value_forward: value for (value_forward, value) in zip(list(_as_tuple(state_forward)), h_c) }
    else:
        replacements = { (state_forward,)[0]: h_c[0] }
    f_x_h_c.replace_placeholders(replacements)  # resolves state_forward := h_c
    h = f_x_h_c.outputs[0]  # 'h' is a Variable (the output of a Function that computed it)
    if _trace_layers:

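The branch above exists because for a single-state cell (such as a GRU) state_forward is one placeholder rather than a tuple. A stand-alone sketch of the dict-building logic, with strings standing in for CNTK variables and _as_tuple assumed to wrap non-tuples into a 1-tuple:

def _as_tuple(x):
    return x if isinstance(x, tuple) else (x,)

def build_replacements(state_forward, h_c):
    # mirror of the replacement logic in Recurrence above
    if type(state_forward) is tuple and len(state_forward) > 1:
        return {fwd: val for (fwd, val) in zip(list(_as_tuple(state_forward)), h_c)}
    else:
        return {(state_forward,)[0]: h_c[0]}

print(build_replacements(('h_fwd', 'c_fwd'), ('h', 'c')))  # two states, e.g. LSTM
print(build_replacements('s_fwd', ('s',)))                 # single state, e.g. GRU
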
@@ -564,9 +564,6 @@ def adam_sgd(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
    <http://arxiv.org/abs/1412.6980>`_. International Conference for
    Learning Representations, 2015.
    '''
    if not low_memory:
        raise NotImplementedError('adam: low_memory=True currently required')

    _verify_learning_rate_type(lr)
    _verify_momentum_type(momentum)
    _verify_momentum_type(variance_momentum)

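For context, a hedged usage sketch of this learner; the cntk.learner module path is an assumption based on the era of this commit, and z stands for a hypothetical model Function:

from cntk.learner import adam_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType

lr = learning_rate_schedule(0.007, UnitType.sample)
mom = momentum_as_time_constant_schedule(1100)
# low_memory=True is, per the check above, currently required:
# learner = adam_sgd(z.parameters, lr=lr, momentum=mom)
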
@@ -647,11 +647,7 @@ class UserFunction(Function):

    '''
    def __init__(self, inputs, name=''):
        # FIXME we need to save a reference here so that the function does not
        # disappear
        self.var_inputs = inputs

        super(Function, self).__init__(inputs, name)
        super(UserFunction, self).__init__(inputs, name)

        # Memory management for user defined functions has to be controlled by
        # the C++ side. For more information:

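The super() fix above matters: super(Function, self) starts the MRO lookup after Function and therefore skips Function.__init__ entirely, while super(UserFunction, self) actually reaches it. A self-contained toy showing the difference (class names here are illustrative, not the CNTK classes):

class Function:
    def __init__(self):
        print('Function.__init__ runs')

class UserFunction(Function):
    def __init__(self, correct=True):
        if correct:
            super(UserFunction, self).__init__()  # lookup starts after UserFunction -> Function
        else:
            super(Function, self).__init__()      # lookup starts after Function -> object, skips it

UserFunction(correct=True)   # prints 'Function.__init__ runs'
UserFunction(correct=False)  # prints nothing
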
@@ -753,11 +749,22 @@ class UserFunction(Function):
        outputs.extend(self.infer_outputs())

    def infer_outputs(self):
        raise NotImplementedError('infer_outputs has to be overridden')
        '''
        Returns a list of all output variables this user-defined function
        outputs.

        Output variables are created by
        :meth:`~cntk.ops.functions.output_variable`.
        '''
        raise NotImplementedError('infer_outputs has to be overwritten')

    def op_name(self):
        '''
        Returns the operator name.
        '''
        return 'UserFunction'


@typemap
def load_model(filename, device=None):
    '''

@@ -0,0 +1,29 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

"""
Unit tests for the cosine distance class.
"""

import numpy as np
import pytest
from .. import *
from ...axis import Axis
from ... import sequence

def test_cosine_distance():
    a = np.reshape(np.arange(25.0, dtype=np.float32), (5,5))
    b = np.reshape(np.arange(0, 5, dtype=np.float32), (1,5))

    src = input_variable(shape=(5), dynamic_axes=[ Axis.default_batch_axis(), Axis("Seq")])
    tgt = input_variable(shape=(5))
    tgt_br = sequence.broadcast_as(tgt, src)
    cos_seq = cosine_distance(src, tgt_br)
    assert len(cos_seq.dynamic_axes)==2
    assert cos_seq.dynamic_axes[1].name=="Seq"
    val = cos_seq.eval({src:[a], tgt:[b]})
    expected = [[ 1., 0.914659, 0.878459, 0.86155, 0.851852]]
    print(np.allclose(val, expected))

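The expected values can be verified with plain NumPy, since each sequence step is simply the cosine between one row of a and b (an illustrative check, not part of the commit):

import numpy as np
a = np.arange(25.0).reshape(5, 5)
b = np.arange(5.0)
cos = a @ b / (np.linalg.norm(a, axis=1) * np.linalg.norm(b))
print(np.round(cos, 6))  # [1. 0.914659 0.878459 0.86155 0.851852]
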
@@ -14,7 +14,7 @@ import pytest
from .ops_test_utils import unittest_helper, _test_unary_op, _test_binary_op, AA, I, precision, PRECISION_TO_TYPE, cntk_device
import cntk as C
from cntk.axis import Axis
from ...utils import sanitize_dtype_cntk
from ...utils import sanitize_dtype_cntk, one_hot
from .. import constant

EPS_IN_LOG = 1e-37  # 1e-37 is the highest guaranteed precision

@@ -395,3 +395,39 @@ def test_op_gather_derived_dynamic_axes_equivalence(device_id, precision):
    res = z.eval({a: input_data1, b: input_data2})
    expected_forward = [[[3.]]]
    assert np.array_equal(res, expected_forward)


def test_op_gather_sparse(device_id):
    from .. import sequence, times

    input_sparse_indices = [[1, 3, 5], [2, 4]]
    vocab_size = 6
    input_data = one_hot(input_sparse_indices, vocab_size)

    a = I(shape=(vocab_size,), is_sparse=True, name='a')

    a_last = sequence.last(a)
    a_last_dense = times(a_last, np.eye(vocab_size))
    res = a_last_dense.eval({a : input_data})
    assert np.array_equal(res, [[[0, 0, 0, 0, 0, 1]], [[0, 0, 0, 0, 1, 0]]])

    a_last_2 = sequence.slice(a, -2, 0)
    a_last_2_dense = times(a_last_2, np.eye(vocab_size))
    res = a_last_2_dense.eval({a : input_data})
    assert np.array_equal(res, [[[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1]], [[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0]]])


def test_op_scatter_sparse(device_id):
    from .. import sequence, times

    input_sparse_indices = [[1, 3, 5], [2, 4]]
    vocab_size = 6
    input_data = one_hot(input_sparse_indices, vocab_size)

    a = I(shape=(vocab_size,), is_sparse=True, name='a')

    a_last_scatter = sequence.scatter(sequence.last(a), sequence.is_first(a))
    a_last_scatter_dense = times(a_last_scatter, np.eye(vocab_size))
    res = a_last_scatter_dense.eval({a : input_data})
    assert np.array_equal(res[0], np.asarray([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]))
    assert np.array_equal(res[1], np.asarray([[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]))

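A dense NumPy picture of what these tests expect; the real one_hot above produces sparse CNTK data, so np.eye is only a stand-in:

import numpy as np
vocab_size = 6
input_sparse_indices = [[1, 3, 5], [2, 4]]
dense = [np.eye(vocab_size)[seq] for seq in input_sparse_indices]  # one-hot rows per sequence
print([seq[-1] for seq in dense])   # last step per sequence, as sequence.last(a) gathers
print([seq[-2:] for seq in dense])  # last two steps, as sequence.slice(a, -2, 0)
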
@@ -46,7 +46,8 @@ def linear_layer(input_var, output_dim):

def dense_layer(input, output_dim, nonlinearity):
    r = linear_layer(input, output_dim)
    r = nonlinearity(r)
    if isinstance(nonlinearity, UserFunction):
        r = user_function(nonlinearity(r))
    return r

def fully_connected_classifier_net(input, num_output_classes, hidden_layer_dim,

@@ -25,7 +25,8 @@ class MyPlus(UserFunction):
        self.backward_calls = 0

    def infer_outputs(self):
        return [output_variable(self.inputs[0].shape, self.inputs[0].dtype, self.inputs[0].dynamic_axes)]
        return [output_variable(self.inputs[0].shape,
            self.inputs[0].dtype, self.inputs[0].dynamic_axes)]

    def forward(self, arguments, device=None, outputs_to_retain=None):
        assert len(self.inputs)==2

@@ -45,7 +46,7 @@ def test_ext_eval_1():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, constant(3))
    m = user_function(MyPlus(i, constant(3)))
    z = m+p

    input_data = np.random.rand(dim)

@@ -56,7 +57,7 @@ def test_ext_eval_2_only_param():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    # combine does not work
    # z = combine([m.output])
    z = m+i

@@ -68,7 +69,7 @@ def test_ext_eval_2_only_param():

def test_ext_eval_3_no_input():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    z = m+0

    result = z.eval()

@@ -79,7 +80,7 @@ def test_ext_eval_4_a_inside_graph():
    dim = 4
    p_init = 10
    p = parameter(shape=(dim,), init=p_init, name='p')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    z = p * m

    result = z.eval()

@@ -90,7 +91,7 @@ def test_ext_eval_4_b_inside_graph():
    dim = 4
    p_init = 10
    p = parameter(shape=(dim,), init=p_init, name='p')
    z = p * MyPlus(p, constant(3))
    z = user_function(p * MyPlus(p, constant(3)))

    result = z.eval()
    # No batch dimension since we have no input

@@ -100,14 +101,14 @@ def test_ext_eval_5_times():
    dim = 2
    p_init = 10
    p = parameter(shape=(dim,), init=p_init, name='p')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    z = times(m, parameter(shape=(2,50), init=2))

    result = z.eval()
    # No batch dimension since we have no input
    assert np.allclose(result, ((p_init*np.ones_like(result))+3)*2*2)

def test_ext_clone():
def test_ext_eval_6_clone():
    dim = 4
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = i + 3

@@ -115,20 +116,34 @@ def test_ext_clone():
    p = parameter(shape=(dim,), init=10, name='p')
    z = m + p

    m_udf = MyPlus(i, constant(3))
    m_udf = user_function(MyPlus(i, constant(3)))
    z_clone = z.clone('share', {m : m_udf})

    input_data = np.random.rand(dim)
    result = z_clone.eval([input_data])
    assert np.allclose(result[0][0], input_data+3+10)

def test_ext_eval_7_placeholder():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    i = input_variable(dim, needs_gradient=True, name='i_var')
    pl = placeholder_variable()
    m = user_function(MyPlus(pl, constant(3)))
    z = m+p
    z.replace_placeholder(i)

    input_data = np.random.rand(dim)
    result = z.eval([input_data])
    assert np.allclose(result[0][0], input_data+3+10)

def test_ext_train():
    dim = 4

    p = parameter(shape=(dim,), init=10)
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, constant(3))
    z = m+p
    # keeping m unwrapped since we need to access its member variables
    z = user_function(m)+p

    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)

@@ -172,7 +187,7 @@ def test_ext_backpropstate(payload):

    p = parameter(shape=(dim,), init=10)
    in1 = input_variable(dim, needs_gradient=True, name='i_var')
    m = TestBackPropState(in1, payload)
    m = user_function(TestBackPropState(in1, payload))
    z = m+p

    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)

@@ -223,6 +238,7 @@ def test_ext_lambdafunc():
    m = LambdaFunc(k,
                   when=lambda arg: np.sum(arg)>1,
                   execute=cb.inc)
    m = user_function(m)
    z = m+0

    momentum_time_constant = momentum_as_time_constant_schedule(1100)

@@ -262,7 +278,7 @@ def test_udf_plus_and_last():
    x = input_variable(shape=(2,))
    y = input_variable(shape=(2,), dynamic_axes=[Axis.default_batch_axis()])

    func = as_composite(PlusAndLast(x, y))
    func = user_function(PlusAndLast(x, y))

    dt_precision = np.float32
    operand1 = [AA([[1., 2.], [3., 4.]], dtype=dt_precision)]

@@ -132,6 +132,26 @@ class Variable(VariableMixin, TensorOpsMixin, cntk_py.Variable):
        super(Variable, self).__init__(shape, is_sparse, dtype, needs_gradient, name,
                                       dynamic_axes)

    @typemap
    def as_parameter(self):
        '''
        Converts this instance into a :class:`Parameter`
        '''
        if not self.is_parameter:
            raise TypeError('cannot be converted into a Parameter')

        return cntk_py.Parameter(self)

    @typemap
    def as_constant(self):
        '''
        Converts this instance into a :class:`Constant`
        '''
        if not self.is_constant:
            raise TypeError('cannot be converted into a Constant')

        return cntk_py.Constant(self)


class Parameter(VariableMixin, TensorOpsMixin, cntk_py.Parameter):
    '''

@@ -20,15 +20,13 @@ def _graph_dict():

    d['i1'] = input_variable(
        shape=(2, 3), dynamic_axes=input_dynamic_axes, name='i1')
    d['i2'] = input_variable(
        shape=(2, 3), dynamic_axes=input_dynamic_axes, name='i2')
    d['c1'] = constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = parameter(shape=(3, 2), init=7, name='p1')

    d['p1'] = parameter(shape=(3, 2), name='p1')

    d['op1'] = plus(d['i1'], d['i2'], name='op1')
    d['op1'] = plus(d['i1'], d['c1'], name='op1')
    d['op2'] = times(d['op1'], d['p1'], name='op2')

    #d['slice'] = slice(d['i2'], Axis.default_dynamic_axis(), 0, 3)
    #d['slice'] = slice(d['c1'], Axis.default_dynamic_axis(), 0, 3)
    #label_sentence_start = sequence.first(raw_labels)

    # no name

@@ -49,9 +47,9 @@ def _simple_dict():
    d = {}

    d['i1'] = input_variable(shape=(2, 3), name='i1')
    d['i2'] = input_variable(shape=(2, 3), name='i2')
    d['p1'] = parameter(shape=(3, 2), name='p1')
    d['op1'] = plus(d['i1'], d['i2'], name='op1')
    d['c1'] = constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = parameter(shape=(3, 2), init=7, name='p1')
    d['op1'] = plus(d['i1'], d['c1'], name='op1')
    d['op2'] = times(d['op1'], d['p1'], name='op2')
    d['root'] = d['op2']

@@ -64,7 +62,7 @@ def _simple_dict():

def test_find_nodes():
    d = _graph_dict()

    for name in ['i1', 'i2', 'p1', 'op1', 'op2', 'past']:
    for name in ['i1', 'c1', 'p1', 'op1', 'op2', 'past']:
        n = find_all_with_name(d['root'], name)
        assert len(n) == 1, name
        assert n[0].name == name, name

@@ -89,6 +87,17 @@ def test_find_nodes():

    assert find_by_name(d['root'], 'none') is None

def test_find_nodes_returning_proper_types():
    d = _graph_dict()

    c1 = find_by_name(d['root'], 'c1')
    assert isinstance(c1, Constant)
    assert np.allclose(c1.value, np.zeros((2,3))+6)

    p1 = find_by_name(d['root'], 'p1')
    assert isinstance(p1, Parameter)
    assert np.allclose(p1.value, np.zeros((3,2))+7)


def test_plot():
    d = _simple_dict()

@@ -107,4 +116,4 @@ def test_depth_first_search():

    found = depth_first_search(d['op2'], lambda x:True)
    found_names = [v.name for v in found]
    assert found_names == ['op2', 'op1', 'i1', 'i2', 'p1']
    assert found_names == ['op2', 'op1', 'i1', 'c1', 'p1']

@@ -8,6 +8,10 @@ import numpy as np
import pytest

from ..layers import *
from ..blocks import init_default_or_glorot_uniform, Parameter, _INFERRED, Placeholder
from ..utils import _as_tuple
from ..ops import sigmoid, times, tanh, element_times, plus, combine, input_variable
from ..axis import Axis

def test_layers_name(device_id):
    from cntk import placeholder_variable, combine

@@ -19,3 +23,45 @@ def test_layers_name(device_id):
    q = Convolution((3,3), 3, name='conv33')(I)
    assert(q.root_function.name == 'conv33')

def gru_cell(shape, init=init_default_or_glorot_uniform, name=''): # (x, (h,c))
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2 # patched dims with stack_axis duplicated 4 times

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S') # (h, c)

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'), name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'), name='r')
    h = tanh(times(x, Uh, name='x*Uh') + times(element_times(Sn_1, r, name='Sprev*r'), Wh), name='h')
    s = plus(element_times((1-z), h, name='(1-z)*h'), element_times(z, Sn_1, name='z*SPrev'), name=name)
    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s

def test_recurrence():
    r = Recurrence(gru_cell(5), go_backwards=False)
    a = input_variable(shape=(5,), dynamic_axes=[Axis.default_batch_axis(), Axis('Seq')])
    x = np.reshape(np.arange(0,25, dtype=np.float32), (1,5,5))
    rt = r(a).eval({a:x})
    print(rt)

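The gru_cell above implements the standard GRU equations: z = sigmoid(x*Uz + s*Wz), r = sigmoid(x*Ur + s*Wr), h = tanh(x*Uh + (s.r)*Wh), s' = (1-z).h + z.s. A plain-NumPy sketch of one recurrence over the same 5-step test input, with random weights standing in for the CNTK Parameters:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

dim = 5
rng = np.random.RandomState(0)
Uz, Ur, Uh = (rng.randn(dim, dim) * 0.1 for _ in range(3))
Wz, Wr, Wh = (rng.randn(dim, dim) * 0.1 for _ in range(3))

def gru_step(x, s_prev):
    z = sigmoid(x @ Uz + s_prev @ Wz)        # update gate
    r = sigmoid(x @ Ur + s_prev @ Wr)        # reset gate
    h = np.tanh(x @ Uh + (s_prev * r) @ Wh)  # candidate state
    return (1 - z) * h + z * s_prev          # new state

s = np.zeros(dim)
for x in np.arange(25.0, dtype=np.float32).reshape(5, 5):
    s = gru_step(x, s)
print(s)  # final state after the 5-step sequence
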
@@ -34,6 +34,8 @@ def sanitize_precision(precision):
        return np.float32
    elif precision in [cntk_py.DataType_Double, 'double', 'float64', np.float64]:
        return np.float64
    elif precision in [cntk_py.DataType_Unknown]:
        return None
    else:
        raise ValueError('precision value: "%s" is not supported' % precision)

@@ -112,6 +114,7 @@ def sanitize_input(arg, fallback_dtype=np.float32, reshape=None):
    ``arg`` is a number or NumPy array. Variable otherwise.
    """

    from cntk.ops.functions import UserFunction
    from cntk.ops.variables import Constant, Variable, Parameter
    from cntk.ops.functions import Function
    from cntk.ops import constant

@@ -483,6 +486,8 @@ def sanitize_dtype_cntk(dtype):
        return cntk_py.DataType_Float
    elif dtype == np.float64:
        return cntk_py.DataType_Double
    elif dtype == object:
        return cntk_py.DataType_Unknown
    else:
        raise ValueError('data type "%s" is not supported' % dtype)

@@ -48,7 +48,10 @@ tuple, strings, etc.)::

This can now be used as a normal operator like::

    s = MySigmoid(prev_node)
    s = user_function(MySigmoid(prev_node))

Note that we cannot pass the `UserFunction` instance directly into the graph;
it represents a primitive function, which we have to pass through `user_function()`.

In case the operator is initialized with multiple inputs, ``forward()``'s
``argument`` will be a list of those inputs::

@@ -133,7 +136,7 @@ interesting behavior, for instance::

    debug_node = LambdaFunc(node,
                            when=lambda arg: np.var(arg)>1,
                            execute=lambda arg: pdb.set_trace())
    # out = ... using debug_node ...
    # out = ... using user_function(debug_node) ...
    # ... training out

Now, if the variance of the input tensor exceeds 1, we will be put into