Merge branch 'master' into qiwye/asgd-dev

Conflicts:
	Source/1BitSGD
	Source/SGDLib/SGD.cpp
This commit is contained in:
Qiwei Ye 2016-07-25 20:13:27 +08:00
Родитель fb9cded35b f3dec438d6
Коммит 5a33a35eac
132 изменённых файлов: 87000 добавлений и 2240 удалений

Просмотреть файл

@ -1133,6 +1133,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Scripts", "Scripts", "{6826
ProjectSection(SolutionItems) = preProject
Scripts\pytest.ini = Scripts\pytest.ini
Scripts\txt2ctf.py = Scripts\txt2ctf.py
Scripts\uci2ctf.py = Scripts\uci2ctf.py
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ManagedEvalTests", "Tests\UnitTests\ManagedEvalTests\ManagedEvalTests.csproj", "{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}"
@ -1142,6 +1143,9 @@ EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "Examples\Evaluation\CSEvalClient\CSEvalClient.csproj", "{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\Evaluation\CPPEvalClient\CPPEvalClient.vcxproj", "{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}"
ProjectSection(ProjectDependencies) = postProject
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution

Просмотреть файл

@ -863,38 +863,27 @@ The dimension reduced matrix consisting of the maximum value within each pooling
This function is often associated with Convolution() operations.
### Delay
### PastValue, FutureValue
Delay node used in recurrent networks, allows creation of a loop in the convolutional network that will repeat a specified number of times.
PastValue and FutureValue nodes are used in recurrent networks, allow creation of a loop in the convolutional network that will repeat a specified number of times. PastValue retrieves the value of a node several steps away in the past, while FutureValue retrieves the value of a node from future.
`Delay(rows, [cols], delayNode, delayTime=1, needGradient=true, defaultHiddenActivity=0.1)`
`PastValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
`FutureValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
#### Parameters
`cvweight` – convolution weight matrix, it has the dimensions of \[outputChannels, kernelWidth \* kernelHeight \* inputChannels\]
`rows` – number of rows in the node
`kernelWidth` – width of the kernel
`cols` – number of cols in the node. This value is often ommit since the length of a sequence varies
`kernelHeight` – height of the kernel
`timeStep` – \[default = 1\] number of time steps toward the past and future
`outputChannels` – number of output channels
`horizontalSubsample` – subsamples in the horizontal direction
`verticalSubsample` – subsamples in the vertical direction
#### Optional Parameters
`delayTime` – \[default = 1\] the amount of delay that will be introduced (number of times the loop will happen)
`needGradient` – \[default = true\] does the gradient need to be computed for this node
`defaultHiddenActivity` – \[default = 0.1\] the numerical amount for the defaultHiddenActivity
`defaultHiddenActivity` – \[default = 0.1\] default value to use when passing the sequence bounday or when the value is missing.
#### Returns
The results of the completed Delay loop
Eitehr the past or future value of a node
#### Notes
This node is used in recurrent networks, where a delay is introduced to examine values from a previous time, such as the prior value (t-1). This has the affect of creating a loop in the computational network that will repeat delayTime number of iterations.
This node is used in recurrent networks, where a past value is introduced to examine values from a previous time, such as the prior value (t-1). This has the affect of creating a loop in the computational network.

Просмотреть файл

@ -37,34 +37,23 @@ int main(int argc, char* argv[])
std::string app = argv[0];
std::string path;
IEvaluateModel<float> *model;
size_t pos;
#ifdef _WIN32
path = app.substr(0, app.rfind("\\"));
// Load the eval library
auto hModule = LoadLibrary(L"evaldll.dll");
if (hModule == nullptr)
{
fprintf(stderr, "Cannot find evaldll.dll library.");
return 1;
}
// Get the factory method to the evaluation engine
std::string func = "GetEvalF";
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<float>)procAddress;
// Native model evaluation instance
getEvalProc(&model);
pos = app.rfind("\\");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
#else // on Linux
path = app.substr(0, app.rfind("/"));
GetEvalF(&model);
pos = app.rfind("/");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. build/release/bin/
const std::string modelWorkingDirectory = path + "/../../../Examples/Image/MNIST/Data/";
#endif
GetEvalF(&model);
const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";

Просмотреть файл

@ -69,7 +69,7 @@
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>%(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -110,9 +110,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
public static bool Evaluate(string record)
{
var model = Models.Take();
var outcome = model.EvaluateRecord(record);
Models.Add(model);
return outcome;
try
{
var outcome = model.EvaluateRecord(record);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
@ -123,9 +129,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
public static List<float> Evaluate(List<float> inputs)
{
var model = Models.Take();
var outcome = model.EvaluateInput(inputs);
Models.Add(model);
return outcome;
try
{
var outcome = model.EvaluateInput(inputs);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>

Просмотреть файл

@ -0,0 +1,226 @@
# The configuration file to build language understanding model with ATIS corpus.
# An LSTM model is built to tag each word in sentences with its semantic label.
WorkDir = work
DataDir = data
modelPath = $WorkDir$/ATIS.slot.lstm
parallelTrain = true
#stderr = $WorkDir$/log
command = Train:Output:Test
precision = "float"
deviceId = "-1" # change to "auto" to use GPUs
wordCount = 944 # number of words
labelCount = 127 # number of labels
# The command to train the LSTM model
Train = [
action = train
BrainScriptNetworkBuilder = [
inputDim = $wordCount$
labelDim = $labelCount$
featDim = inputDim*3 # contextual words are used as features: previous word, current word, next word.
embDim = 150
hiddenDim = 300
maxLayer = 1
initScale = 6
featuresPW = Input(inputDim) # the previous word
featuresCW = Input(inputDim) # the current word
featuresNW = Input(inputDim) # the next word
features = RowStack(featuresPW : featuresCW : featuresNW)
labels = Input(labelDim, tag = "label")
# embedding layer
emb = Parameter(embDim, featDim)
featEmbedded = emb * features
# build the LSTM stack
lstmDims[i:0..maxLayer-1] = hiddenDim
NoAuxInputHook (input, lstmState) = BS.Constants.None
lstmStack = BS.RNNs.RecurrentLSTMPStack (lstmDims,
cellDims=lstmDims,
featEmbedded,
inputDim=embDim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=BS.RNNs.NoAuxInputHook,
augmentInputDim=0,
enableSelfStabilization=false)
lstmOutputLayer = Length (lstmStack)-1
LSTMoutput = lstmStack[lstmOutputLayer].h
W = Parameter(labelDim, hiddenDim, init = "uniform", initValueScale=initScale)
b = Parameter(labelDim, 1, init = "fixedValue", value=0)
outputs = W * LSTMoutput + b
cr = CrossEntropyWithSoftmax(labels, outputs)
criterionNodes = (cr)
evaluationNodes = (cr)
outputNodes = (outputs)
]
SGD = [
# maximum number of epochs
maxEpochs = 1 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
# for each epoch, maximum number of input samples(words) is set below
epochSize = 36000
# minibatchSize should be larger than the maximum sentence length
minibatchSize = 70
learningRatesPerSample = 0.01*2:0.005*12:0.001
gradUpdateType = "FSAdaGrad"
gradientClippingWithTruncation = true
clippingThresholdPerSample = 15.0
# number of minibatches to report progress
numMBsToShowResult = 100
firstMBsToShowResult = 10
# if validation shows that the model has no improvement, then do back-up to the previously
# estimated model and reduce learning rate
loadBestModel = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
parallelizationStartEpoch = 2
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
]
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.train.cntk.sparse"
randomize = true
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]
# Evaluate the model to predict labels
Output = [
action = "write"
traceLevel = 1
epochSize = 0
defaultHiddenActivity = 0.1
BrainScriptNetworkBuilder = [
modelAsTrained = BS.Network.Load ("$modelPath$")
final = Hardmax(modelAsTrained.outputs)
]
outputPath = $WorkDir$/model.writeaction
outputNodeNames = final
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.test.cntk.sparse"
randomize = false
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]
# Evaluate the model's accuracy
Test = [
action = "test"
traceLevel = 1
epochSize = 0
defaultHiddenActivity = 0.1
BrainScriptNetworkBuilder = [
labels = Input($labelCount$, tag = "label")
modelAsTrained = BS.Network.Load ("$modelPath$")
final = Hardmax(modelAsTrained.outputs)
errorRate = ErrorPrediction(labels, final, tag='evaluation')
]
evalNodeNames = errorRate
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.test.cntk.sparse"
randomize = false
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]

Просмотреть файл

@ -0,0 +1,168 @@
# Build Language Understanding Models with CNTK
This example demonstrates how to use build language understanding model with CNTK using ATIS data set. This example is similar to
[SLU example](https://github.com/Microsoft/CNTK/tree/master/Examples/Text/Miscellaneous/SLU). They are different in that
- CNTKTextFormatReader is used here, instead of LUSequenceReader
- With CNTKTextFormatReader, the input format is much more flexible. In the example setting, sparse contextual feature vectors are explored
- Sparse label input is used.
The Air travel information system (ATIS) corpus is used for training and testing.
## Download the example
The data and configuration is checked in to github. You can get it by command:
`git clone https://github.com/Microsoft/cntk`
The example is under folder:
`<cntk_root>\Examples\Text\ATIS`
## Data File Format
There are four files under `data` sub-folder
|Files |Content |
|:----------------------|:--------|
|ATIS.train.cntk.sparse |featurized training data set
|ATIS.test.cntk.sparse |featurized test data set
|ATIS.vocab |all words extracted from training data. Vocab size: 944
|ATIS.labels |all semantic labels extracted from training data. Total labels: 127
We preprocess ATIS data by converting words into word indexes, and labels into label IDs in order to use
[CNTKTextFormatReader](https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader). You can use any
script/tool to preprocess your text data files. In this example, data is already preprocessed.
The last two files ATIS.vocab and ATIS.labels are not really required to run the example. They are included for evaluation and debugging purpose.
E.g. they can be used to convert .sparse files back to original text files.
To understand the data format (two .sparse files), let's start with a sample sentence:
```
BOS i would like to find a flight from charlotte to Las Vegas that makes a stop in St. Louis EOS
```
it is converted into the following text:
```
1 |PW 1:1 |CW 1:1 |NW 12:1 |L 126:1
1 |PW 1:1 |CW 12:1 |NW 39:1 |L 126:1
1 |PW 12:1 |CW 39:1 |NW 28:1 |L 126:1
1 |PW 39:1 |CW 28:1 |NW 3:1 |L 126:1
1 |PW 28:1 |CW 3:1 |NW 86:1 |L 126:1
1 |PW 3:1 |CW 86:1 |NW 15:1 |L 126:1
1 |PW 86:1 |CW 15:1 |NW 10:1 |L 126:1
1 |PW 15:1 |CW 10:1 |NW 4:1 |L 126:1
1 |PW 10:1 |CW 4:1 |NW 101:1 |L 126:1
1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1
1 |PW 101:1 |CW 3:1 |NW 92:1 |L 126:1
1 |PW 3:1 |CW 92:1 |NW 90:1 |L 78:1
1 |PW 92:1 |CW 90:1 |NW 33:1 |L 123:1
1 |PW 90:1 |CW 33:1 |NW 338:1 |L 126:1
1 |PW 33:1 |CW 338:1 |NW 15:1 |L 126:1
1 |PW 338:1 |CW 15:1 |NW 132:1 |L 126:1
1 |PW 15:1 |CW 132:1 |NW 17:1 |L 126:1
1 |PW 132:1 |CW 17:1 |NW 72:1 |L 126:1
1 |PW 17:1 |CW 72:1 |NW 144:1 |L 71:1
1 |PW 72:1 |CW 144:1 |NW 2:1 |L 119:1
1 |PW 144:1 |CW 2:1 |NW 2:1 |L 126:1
```
where the first column identifies the sequence (sentence) ID, which is the same for all words of the same sentence. There are four input streams: PW, CW, NW, L.
The input "PW" represents the previous word ID, "CW" for current word, and "NW" for next word. Input name "L" is for labels. The input names can be anything you
like and you can add more input as needed, e.g. words in a bigger window.
Words "BOS" and "EOS" denote beginning of sentence and end of sentences respectively.
Each line above represents one sample (word). E.g. the meaning of this line: `1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1`:
* the sequence ID is 1
* the current word is "charlotte" whose word ID is 101
* the previous word is "from" whose ID is 4
* the next word is "to" whose ID is 3
* the semantic label is "B-fromloc.city_name" whose label Id is 48.
All word IDs, label IDs and corresponding words and labels are stored in ATIS.vocab and ATIS.labels.
## CNTK Configuration
In this example, we use BrainScript to create one-layer LSTM with embedding for slot tagging. The consolidated config file is ATIS.cntk. One can check the file (with some comments)
for details, especially how the reader is configured in ATIS.cntk.
reader=[
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.train.cntk.sparse"
miniBatchMode = "partial"
randomize = true
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
The above section tell CNTK to use CNTKTextFormatReader to read data from the file "$DataDir/ATIS.train.cntk.sparse". The same input names (PW, CW, NW, L) are used to refer inputs (features and labels) provided in data files. The input is read into different
feature vectors: featuresPW, featuresCW, featuresNW and labels. These vectors are later used to build LSTM node with BrainScript as follows.
```
featuresPW = Input(inputDim)
featuresCW = Input(inputDim)
featuresNW = Input(inputDim)
features = RowStack(featuresPW : featuresCW : featuresNW)
labels=Input(labelDim, tag="label")
# embedding layer
emb = LearnableParameter(embDim, featDim)
featEmbedded = Times(emb, features)
# build the LSTM stack
lstmDims[i:0..maxLayer] = hiddenDim
NoAuxInputHook (input, lstmState) = BS.Constants.None
lstmStack = BS.RNNs.RecurrentLSTMPStack (lstmDims,
cellDims=lstmDims,
featEmbedded,
inputDim=embDim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=BS.RNNs.NoAuxInputHook,
augmentInputDim=0,
enableSelfStabilization=false)
lstmOutputLayer = Length (lstmStack)-1
LSTMoutput = lstmStack[lstmOutputLayer].h
```
A few other notes about the config:
- it is important to specify the format is "sparse".
- the gradUpdateType is set FSAdaGrad. This setting reports better model accuracy comparing any other update methods.
- multiple LSTM layers can be used by changing the value of maxLayer.
Three commands are configured: Train, Output and Test. The command "Train" is used to train a model, "Output" is used to evaluate the model against a test set and store
the model output, and the command "Test" is to calculate the model's accuracy.
## Run the example
One can run the example locally or on Philly (for Microsoft internal users).
To run locally,
```sh
> mkdir work # the default work_dir
> open ATIS.cntk and update the value of deviceId: -1 for CPU, auto for GPU
> cntk.exe configFile=ATIS.cntk
```
By default, the maxEpochs is set to 1 to save training time. One can change it to larger value such as 20 in order to get a good model accuracy.
Depends on GPU, it normally takes about 20 minutes to run 20 epochs on single GPU, and slot F1 score is about 93.
**For Microsoft users only**, to run the job on Philly:
- first upload data folder to philly cloud. e.g. `\\storage.gcr.philly.selfhost.corp.microsoft.com\pnrsy\<your_alias>\ATIS `
- update the config file to philly cloud, e.g. `\\storage.gcr.philly.selfhost.corp.microsoft.com\pnrsy_scratch\<your_alias>\ATIS`
- go to http://philly/ to create a new job by specifying data folder and config file, and start the job.
More details about Philly, including how to upload data to Philly and start jobs, can be found [here](https://microsoft.sharepoint.com/teams/ATISG/SitePages/Philly%20Users%20Guide.aspx)

Просмотреть файл

@ -0,0 +1,127 @@
B-aircraft_code
B-airline_code
B-airline_name
B-airport_code
B-airport_name
B-arrive_date.date_relative
B-arrive_date.day_name
B-arrive_date.day_number
B-arrive_date.month_name
B-arrive_date.today_relative
B-arrive_time.end_time
B-arrive_time.period_mod
B-arrive_time.period_of_day
B-arrive_time.start_time
B-arrive_time.time
B-arrive_time.time_relative
B-booking_class
B-city_name
B-class_type
B-compartment
B-connect
B-cost_relative
B-day_name
B-day_number
B-days_code
B-depart_date.date_relative
B-depart_date.day_name
B-depart_date.day_number
B-depart_date.month_name
B-depart_date.today_relative
B-depart_date.year
B-depart_time.end_time
B-depart_time.period_mod
B-depart_time.period_of_day
B-depart_time.start_time
B-depart_time.time
B-depart_time.time_relative
B-economy
B-fare_amount
B-fare_basis_code
B-flight
B-flight_days
B-flight_mod
B-flight_number
B-flight_stop
B-flight_time
B-fromloc.airport_code
B-fromloc.airport_name
B-fromloc.city_name
B-fromloc.state_code
B-fromloc.state_name
B-meal
B-meal_code
B-meal_description
B-mod
B-month_name
B-or
B-period_of_day
B-restriction_code
B-return_date.date_relative
B-return_date.day_name
B-return_date.day_number
B-return_date.month_name
B-return_date.today_relative
B-return_time.period_mod
B-return_time.period_of_day
B-round_trip
B-state_code
B-state_name
B-stoploc.airport_code
B-stoploc.airport_name
B-stoploc.city_name
B-stoploc.state_code
B-time
B-time_relative
B-today_relative
B-toloc.airport_code
B-toloc.airport_name
B-toloc.city_name
B-toloc.country_name
B-toloc.state_code
B-toloc.state_name
B-transport_type
I-airline_name
I-airport_name
I-arrive_date.day_number
I-arrive_time.end_time
I-arrive_time.period_of_day
I-arrive_time.start_time
I-arrive_time.time
I-arrive_time.time_relative
I-city_name
I-class_type
I-cost_relative
I-depart_date.day_number
I-depart_date.today_relative
I-depart_time.end_time
I-depart_time.period_of_day
I-depart_time.start_time
I-depart_time.time
I-depart_time.time_relative
I-economy
I-fare_amount
I-fare_basis_code
I-flight_mod
I-flight_number
I-flight_stop
I-flight_time
I-fromloc.airport_name
I-fromloc.city_name
I-fromloc.state_name
I-meal_code
I-meal_description
I-restriction_code
I-return_date.date_relative
I-return_date.day_number
I-return_date.today_relative
I-round_trip
I-state_name
I-stoploc.city_name
I-time
I-today_relative
I-toloc.airport_name
I-toloc.city_name
I-toloc.state_name
I-transport_type
O

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,944 @@
</s>
BOS
EOS
to
from
flights
the
on
what
me
flight
show
i
boston
san
a
denver
in
and
francisco
atlanta
is
pittsburgh
dallas
all
baltimore
list
philadelphia
like
are
airlines
of
between
that
washington
pm
leaving
please
morning
would
fly
for
city
fare
wednesday
first
need
after
trip
oakland
there
ground
round
does
transportation
'd
which
cheapest
you
arriving
class
before
available
american
new
fares
milwaukee
with
give
have
afternoon
york
st.
one
dc
at
way
monday
leave
arrive
airport
thursday
how
want
tuesday
nonstop
find
am
earliest
go
vegas
miami
las
united
information
orlando
phoenix
chicago
sunday
saturday
evening
charlotte
twenty
newark
can
delta
toronto
seattle
diego
kansas
indianapolis
houston
airline
noon
any
friday
lake
salt
's
next
us
o'clock
cleveland
continental
air
angeles
los
august
worth
do
fort
july
stop
code
5
seventh
early
memphis
tell
aircraft
downtown
or
june
6
louis
montreal
cincinnati
around
tomorrow
cost
going
latest
petersburg
tampa
many
minneapolis
nashville
8
get
mean
jose
detroit
10
an
departing
stopover
tacoma
by
about
twa
much
7
leaves
may
long
type
burbank
see
expensive
ticket
international
12
travel
could
dollars
than
daily
columbus
service
beach
'm
california
9
night
least
know
economy
time
4
depart
into
meal
paul
coach
book
april
airports
northwest
la
lowest
now
december
less
westchester
day
serves
it
serve
november
okay
arrives
used
field
love
last
ontario
second
county
return
kind
september
mitchell
general
as
stops
flying
2
third
be
direct
fifth
eighth
stopping
times
breakfast
out
make
capacity
car
take
schedule
seating
sixth
1000
number
goes
cities
dinner
connecting
3
dl
fourth
airfare
possible
this
has
served
meals
ninth
looking
also
restriction
week
late
eastern
returning
back
today
interested
price
business
most
prices
1991
two
types
flies
twentieth
will
through
limousine
ua
bwi
via
tenth
using
stand
plane
ap
fifteenth
guardia
same
1
should
other
arrangements
f
only
rental
then
display
your
shortest
wednesdays
listing
canadian
classes
again
numbers
thirtieth
florida
express
midwest
tickets
where
twelfth
sixteenth
h
north
eleventh
carolina
seventeenth
under
smallest
mco
distance
lunch
either
makes
if
qx
transport
far
hp
57
october
no
my
m80
thank
arizona
jfk
colorado
jersey
q
weekday
airplane
y
planes
some
departure
use
ewr
their
ohio
thirty
nineteenth
when
fourteenth
explain
layover
alaska
march
stopovers
live
people
traveling
serving
rent
hi
offer
later
yes
january
area
logan
right
booking
sfo
midnight
yn
but
during
landings
february
dfw
abbreviation
630
both
're
230
qw
boeing
coming
passengers
arrange
hours
qo
codes
trying
tower
466
canada
each
530
over
uses
arrivals
11
southwest
281
trips
838
days
those
takeoffs
lufthansa
west
1100
arrival
757
minnesota
anywhere
america
430
thrift
let
mornings
nationair
'll
kinds
cheap
close
seats
pennsylvania
name
quebec
indiana
michigan
saturdays
different
taxi
provided
rates
utah
these
starting
sometime
costs
making
bh
eighteenth
following
another
ff
near
747
ea
1992
connect
help
choices
sa
maximum
wish
1115
six
weekdays
more
total
s
dc10
d9s
2100
snack
1245
georgia
72s
73s
f28
heading
departures
amount
825
737
813
ap57
sixteen
m
sorry
serviced
three
miles
departs
1700
requesting
718
land
nevada
100
so
tennessee
tuesdays
hello
destination
reservation
texas
rentals
co
meaning
ap80
1500
270
thursdays
philly
thirteenth
services
sundays
turboprop
stands
415
provide
cars
we
great
mondays
include
sure
't
well
2134
fn
555
ord
934
connection
296
abbreviations
755
highest
hold
720
fit
80
soon
four
ten
noontime
too
offers
options
within
difference
c
restrictions
plan
originating
describe
nw
1110
connections
dulles
21
733
say
approximately
define
852
1291
rate
who
proper
beginning
being
329
352
don
1024
such
wanted
615
mealtime
provides
prefer
1288
257
across
continent
overnight
local
route
746
off
j31
closest
19
lax
l10
be1
1994
red
eye
not
aa
dca
determine
1200
1205
dtw
airfares
capacities
200
town
lga
300
1993
database
1765
eight
up
originate
look
cp
carries
here
201
located
dinnertime
1039
lastest
1222
they
just
d
limo
3724
210
stapleton
343
1145
schedules
932
nonstops
without
landing
b
midway
217
bound
727
takeoff
324
train
along
friends
transcontinental
missouri
reservations
lives
767
269
ac
atl
month
taking
repeat
845
airplanes
buy
still
itinerary
actually
earlier
various
reaching
very
names
505
grounds
ap68
must
kennedy
operation
4400
1201
297
question
combination
basis
laying
1133
650
tonight
43
ls
sam
ap58
once
nighttime
yx
kw
212
1600
tpa
prior
good
1800
819
inform
k
dc9
305
anything
771
459
calling
designate
417
spend
hou
1220
directly
jet
reverse
staying
l1011
belong
445
515
travels
order
mci
150
110
connects
charges
minimum
intercontinental
497766
sounds
811
seat
final
phl
20
start
823
1059
271
382
able
put
locate
hartfield
scheduled
run
225
1158
equipment
begins
lands
reaches
carried
wn
bn
try
included
130
continuing
india
lester
pearson
listings
1209
everywhere
sd
whether
offered
486
1300
950
usa
1045
al
currently
enroute
visit
them
takes
55
thing
705
fridays
catch
straight
advertises
having
planning
listed
1055
405
468
equal
working
sb
hopefully
dh8
symbols
sort
cover
810
operating
320
639
seventeen
1207
608
besides
companies
've
got
somebody
else
wants
level
vicinity
1940
311
mia
instead
priced
eleven
comes
greatest
summer
economic
bay
402
gets
date
1020
730
400
doesn
toward
home
1850
1505
runs
673
723
thanks
bring
zone
yyz
afternoons
non
largest
500
come
428
98
qualify
279
137338
d10
539
fine
while
665
concerning
iah
1230
oak
preferably
twelve
3357
323
nights
229
regarding
seven
inexpensive
420
416
repeating
scenario
139
82
kindly
limousines
345
afterwards
734
place
includes
106
1026
124
fifteen
bna
supper
oh
71
thereafter
2153
year
discount
1130
1030
world
trans
including
represented
o
'hare
exceeding
815
928
163
bur
419
cvg
1017
315
842
1083
0900
longest
called
snacks
645
ever
single

131
Makefile
Просмотреть файл

@ -71,7 +71,7 @@ INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2L
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= -msse3 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=
@ -375,6 +375,8 @@ CNTKLIBRARY_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -437,7 +439,7 @@ EVAL_SRC=\
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
EVAL_SRC+=$(SGDLIB_SRC)
EVAL_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -814,6 +816,127 @@ $(CNTK_CORE_BS): $(SOURCEDIR)/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@echo bin-placing deployable resource files
cp -f $^ $@
########################################
# Unit Tests
########################################
# use system pre-installed Boost libraries
# Todo: use our own version of boost libraries
BOOSTLIB_PATH = /usr/lib/x86_64-linux-gnu
BOOSTLIBS := boost_unit_test_framework boost_filesystem boost_system
UNITTEST_EVAL_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/EvalExtendedTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/stdafx.cpp
UNITTEST_EVAL_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_EVAL_SRC))
UNITTEST_EVAL := $(BINDIR)/evaltests
# Temporarily not build unit tests as the docker image does not include boost.
#ALL += $(UNITTEST_EVAL)
#SRC += $(UNITTEST_EVAL_SRC)
$(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(EVAL) -l$(CNTKMATH)
#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))
UNITTEST_READER := $(BINDIR)/readertests
# Temporarily not build unit tests as the docker image does not include boost.
#ALL += $(UNITTEST_READER)
#SRC += $(UNITTEST_READER_SRC)
$(UNITTEST_READER): $(UNITTEST_READER_OBJ) | $(HTKMLFREADER) $(HTKDESERIALIZERS) $(UCIFASTREADER) $(COMPOSITEDATAREADER) $(IMAGEREADER) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(CNTKMATH)
########################################
# Network unit tests: builds the NetworkTests sources together with the
# BrainScript / NDL / ActionsLib sources they exercise, then links against
# the CNTK math library and Boost; the text-format reader is needed at run time.
# NOTE(review): the last entry of the source list below ends with a '\'
# line continuation; confirm the original file terminates the list (e.g.
# with a blank line) before the '+=' lines, or they would be swallowed
# into the assignment.
########################################
UNITTEST_NETWORK_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
$(SOURCEDIR)/ActionsLib/SpecialPurposeActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
UNITTEST_NETWORK_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
UNITTEST_NETWORK_SRC += $(CNTK_COMMON_SRC)
UNITTEST_NETWORK_SRC += $(SEQUENCE_TRAINING_LIB_SRC)
UNITTEST_NETWORK_SRC += $(SGDLIB_SRC)
# Objects come from both .cpp and .cu sources, hence the nested patsubst.
UNITTEST_NETWORK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_NETWORK_SRC)))
UNITTEST_NETWORK := $(BINDIR)/networktests
# Temporarily do not build the unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_NETWORK)
#SRC += $(UNITTEST_NETWORK_SRC)
# $(CNTKMATH_LIB) and $(CNTKTEXTFORMATREADER) are order-only prerequisites
# (after '|'): required to exist, but the binary is not relinked when they change.
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(CNTKMATH) $(LIBS)
########################################
# Math unit tests: covers the CPU/GPU dense and sparse matrix
# implementations, BLAS interop, convolution / batch-normalization engines,
# quantization, and matrix file I/O.
# NOTE(review): the last source entry below ends with a '\' continuation;
# confirm the original file has a separator (e.g. blank line) before the
# UNITTEST_MATH_OBJ assignment.
########################################
UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BlockMultiplierTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/constants.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/fixtures.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixCudaBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixDataSynchronizationTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixFileWriteReadTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
UNITTEST_MATH := $(BINDIR)/mathtests
# Temporarily do not build the unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_MATH)
#SRC += $(UNITTEST_MATH_SRC)
# $(CNTKMATH_LIB) after '|' is an order-only prerequisite.
$(UNITTEST_MATH): $(UNITTEST_MATH_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) $(LIBS) -l$(CNTKMATH)
unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH)
########################################
# General compile and dependency rules
########################################
@ -838,13 +961,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: clean buildall all
.PHONY: clean buildall all unittests
clean:
@echo $(SEPARATOR)

Просмотреть файл

@ -1,6 +1,11 @@
# CNTK
## Latest news
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-07-12.* We have further expanded Licensing options for CNTK 1bit-SGD and related components. See the details at the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the new CNTK 1bit-SGD License that we have announced on Jun 23, 2016.
*2016-07-05.* CNTK now supports *Deconvolution* and *Unpooling*. See the usage example in the Network number 4 in [MNIST Sample](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md).
*2016-06-23.* New License Terms for CNTK 1bit-SGD and related components.
@ -8,12 +13,6 @@ Effective immediately the License Terms for CNTK 1bit-SGD and related components
*2016-06-20.* A [post](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/) on Intel MKL and CNTK is published in the [Intel IT Peer Network](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/)
*2016-06-16.* V 1.5 Binary release. NuGet Package with CNTK Model Evaluation Libraries.
NuGet Package is added to CNTK v.1.5 binaries. See [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) and [NuGet Package description](https://github.com/Microsoft/CNTK/wiki/Nuget-Package-for-Evaluation).
*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

24
Scripts/README.md Normal file
Просмотреть файл

@ -0,0 +1,24 @@
This directory contains scripts that help with using different components of CNTK.
### CNTK Text format Converters
Two Python scripts for converting data to CNTK Text format, for use as input to the CNTK Text Format Reader (see https://github.com/microsoft/cntk/wiki/CNTKTextFormat-Reader).
```
txt2ctf.py
```
Converts a set of dictionary files and a plain text file to CNTK Text format. Run ```python txt2ctf.py -h``` to see usage instructions. See the comments at the beginning of the script file for a specific usage example.
```
uci2ctf.py
```
Converts data stored in a text file in UCI format to CNTK Text format. Run ```python uci2ctf.py -h``` to see usage instructions and example. Also see a usage example below:
```
python Scripts/uci2ctf.py --input_file Examples/Image/MNIST/Data/Train-28x28.txt --features_start 1 --features_dim 784 --labels_start 0 --labels_dim 1 --num_labels 10 --output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt
```
```input_file``` – original dataset in the (columnar) UCI format
```features_start``` – index of the first feature column (start parameter in the UCIFastReader config, see https://github.com/Microsoft/CNTK/wiki/UCI-Fast-Reader)
```features_dim``` – number of feature columns (dim parameter in the UCIFastReader config)
```labels_start``` - index of the first label column
```labels_dim``` – number of label columns
```num_labels``` – number of possible label values (labelDim parameter in the UCIFastReader config)
```output_file``` – path and filename of the resulting dataset.

Просмотреть файл

@ -191,6 +191,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(KhatriRaoProductNode), L"ColumnwiseCrossProduct")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogPlusNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogSoftmaxNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogisticNode), L"Logistic")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LookupTableNode))) ret = true;

Просмотреть файл

@ -53,7 +53,6 @@ public:
__declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where)
{
//Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
throw EvaluationException(msg, where);
}

Просмотреть файл

@ -89,9 +89,18 @@ struct Issue
// Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
// Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
// Pretty-prints an issue (error or warning) with its source-location history:
// builds the full message via CreateIssueMessage() and writes it to stderr,
// flushing immediately so the output is visible even if the process aborts.
/*static*/ void TextLocation::PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
wstring error = CreateIssueMessage(locations, errorKind, kind, what);
fprintf(stderr, "%ls", error.c_str());
fflush(stderr);
}
/*static*/ wstring TextLocation::CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
vector<Issue> issues; // tracing the error backwards
size_t symbolIndex = 0;
wstring message;
for (size_t n = 0; n < locations.size(); n++)
{
let& location = locations[n];
@ -125,20 +134,23 @@ struct Issue
if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
{
let& firstLoc = issues.front().location;
fprintf(stderr, "[CALL STACK]\n");
message += wstrprintf(L"[CALL STACK]\n");
for (auto i = issues.rbegin(); i != issues.rend(); i++)
{
let& issue = *i;
auto& where = issue.location;
const auto& lines = where.GetSourceFile().lines;
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str());
}
fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
}
else
fprintf(stderr, "%ls while %ls", errorKind, kind);
fprintf(stderr, ": %ls\n", what), fflush(stderr);
{
message += wstrprintf(L"%ls while %ls", errorKind, kind);
}
message += wstrprintf(L": %ls\n", what);
return message;
}
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;

Просмотреть файл

@ -37,6 +37,7 @@ struct TextLocation // position in the text. Lightweight value struct that we ca
// helpers for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error
static void PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static std::wstring CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static void Trace(TextLocation, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath);
// construction
@ -77,8 +78,12 @@ public:
} // where the error happened
virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"
wstring GetError(const std::wstring& linePrefix) const override
{
return TextLocation::CreateIssueMessage(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
// pretty-print this as an error message
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const override
{
TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}

Просмотреть файл

@ -18,6 +18,7 @@
#include "NDLNetworkBuilder.h"
#include "ModelEditLanguage.h"
#include "CPUMatrix.h" // used for SetNumThreads()
#include "GPUMatrix.h" // used for SyncGuard::EnableSync()
#include "CommonMatrix.h"
#include "SGD.h"
#include "MPIWrapper.h"
@ -117,6 +118,23 @@ size_t GetMaxEpochs(const ConfigParameters& configParams)
return maxEpochs;
}
#ifndef CPUONLY
// Aborts execution (via InvalidArgument) if the GPU identified by deviceId is
// not usable: either its compute capability is below 3.0, or the device id is
// unknown. Any other GpuValidity value is accepted silently.
void CheckSupportForGpu(DEVICEID_TYPE deviceId)
{
auto gpuData = GetGpuData(deviceId);
if (gpuData.validity == GpuValidity::ComputeCapabilityNotSupported)
{
InvalidArgument("CNTK: The GPU (%s) has compute capability %d.%d. CNTK is only supported on GPUs with compute capability 3.0 or greater",
gpuData.name.c_str(), gpuData.versionMajor, gpuData.versionMinor);
}
else if (gpuData.validity == GpuValidity::UnknownDevice)
{
InvalidArgument("CNTK: Unknown GPU with Device ID %d.", gpuData.deviceId);
}
}
#endif
// special temporary function to guard against a now invalid usage of "truncated" which exists in some IPG production setups
static void DisableLegacyTruncationSettings(const ConfigParameters& TopLevelConfig, const ConfigParameters& commandConfig)
{
@ -373,6 +391,30 @@ void PrintUsageInfo()
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// Prints one line of info per visible GPU device to stderr
// (e.g. Device[0]: cores = 2496; computeCapability = 5.2; type = "Quadro M4000"; memory = 8192 MB).
// Compiles to a no-op in CPUONLY builds.
void PrintGpuInfo()
{
#ifndef CPUONLY
std::vector<GpuData> gpusData = GetAllGpusData();
if (gpusData.empty())
{
LOGPRINTF(stderr, "No GPUs found\n");
return;
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "GPU info:\n\n");
// NOTE(review): 'data' is only read here; could be 'const GpuData&'.
for (GpuData& data : gpusData)
{
LOGPRINTF(stderr, "\t\tDevice[%d]: cores = %d; computeCapability = %d.%d; type = \"%s\"; memory = %lu MB\n",
data.deviceId, data.cudaCores, data.versionMajor, data.versionMinor, data.name.c_str(), data.totalMemory);
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
#endif
}
// ---------------------------------------------------------------------------
// main() for use with BrainScript
// ---------------------------------------------------------------------------
@ -464,6 +506,21 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
let& config = valp.AsRef<ScriptableObjects::IConfigRecord>(); // this is the dictionary
#ifndef CPUONLY
auto valpp = config.Find(L"deviceId");
if (valpp)
{
auto valp = *valpp;
if (!valp.Is<ScriptableObjects::String>()) // if it's not string 'auto' or 'cpu', then it's a gpu
{
if (static_cast<int>(valp) >= 0) // gpu (id >= 0)
{
CheckSupportForGpu(valp); // throws if gpu is not supported
}
}
}
#endif
// legacy parameters that have changed spelling
if (config.Find(L"DoneFile")) // variables follow camel case (start with lower-case letters)
InvalidArgument("Legacy spelling of 'DoneFile' no longer allowed. Use 'doneFile'.");
@ -485,6 +542,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
bool synchronizeCUDAKernelExecutions = config(L"synchronizeCUDAKernelExecutions", false);
if (synchronizeCUDAKernelExecutions)
SyncGuard::EnableSync();
// logging
wstring logpath = config(L"stderr", L"");
if (logpath != L"")
@ -502,6 +563,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// echo config info to log
PrintBuiltInfo();
// echo gpu info to log
PrintGpuInfo();
// execute the actions
// std::string type = config(L"precision", "float");
int numCPUThreads = config(L"numCPUThreads", 0);
@ -559,6 +623,18 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
{
ConfigParameters config;
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); // get the command param set they want
#ifndef CPUONLY
ConfigValue val = config("deviceId", "auto");
if (!EqualCI(val, "cpu") && !EqualCI(val, "auto"))
{
if (static_cast<int>(val) >= 0) // gpu (id >= 0)
{
CheckSupportForGpu(static_cast<int>(val)); // throws if gpu is not supported
}
}
#endif
bool timestamping = config(L"timestamping", false);
if (timestamping)
{
@ -602,6 +678,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
}
PrintBuiltInfo(); // this one goes to log file
PrintGpuInfo();
std::string timestamp = TimeDateStamp();
// dump config info

Просмотреть файл

@ -144,6 +144,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
@ -222,4 +223,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -47,6 +47,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElementType>
class ComputationNode;
class File;
}}}
// TODO: The following should be reconciled with the equivalent code in the CNTK implementation
@ -135,352 +137,30 @@ namespace CNTK
// Forward declarations
class CompositeFunction;
class Function;
class Variable;
namespace Internal
// Similar to make_shared except that it associates a custom deleter with the shared_ptr to ensure
// that objects are deleted on the same side of the library DLL where they are allocated
template <typename T, typename ...CtorArgTypes>
inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs)
{
// A reference count to be used as the base class for all reference counted types.
class CNTK_API ReferenceCount
{
public:
ReferenceCount();
virtual ~ReferenceCount();
size_t AddReference();
size_t RemoveReference();
size_t GetReferenceCount();
private:
std::atomic<size_t>* m_rc;
};
// A smart pointer to a reference counted object
// T must be a type derived from ReferenceCount
template <class T>
class CNTK_API ReferenceCountedPtr final
{
typedef void(*ReferenceCountedObjectDeleter)(ReferenceCount* obj);
public:
ReferenceCountedPtr(T* ptr = nullptr, ReferenceCountedObjectDeleter deleter = nullptr) : m_objPtr(ptr), m_deleter(deleter)
{
AddReferenceIfNeeded();
}
ReferenceCountedPtr(const ReferenceCountedPtr& other) : m_objPtr(nullptr), m_deleter(nullptr)
{
*this = other;
}
ReferenceCountedPtr(ReferenceCountedPtr&& other) : m_objPtr(nullptr), m_deleter(nullptr)
{
*this = std::move(other);
}
~ReferenceCountedPtr()
{
DeleteReferenceIfNeeded(m_objPtr, m_deleter);
}
ReferenceCountedPtr& operator=(const ReferenceCountedPtr& other)
{
if (this != &other)
{
T* oldPtr = m_objPtr;
ReferenceCountedObjectDeleter oldDeleter = m_deleter;
m_objPtr = other.m_objPtr;
m_deleter = other.m_deleter;
AddReferenceIfNeeded();
DeleteReferenceIfNeeded(oldPtr, oldDeleter);
}
return *this;
}
ReferenceCountedPtr& operator=(ReferenceCountedPtr&& other)
{
assert(this != &other);
T* oldPtr = m_objPtr;
ReferenceCountedObjectDeleter oldDeleter = m_deleter;
m_objPtr = other.m_objPtr;
m_deleter = other.m_deleter;
// No change to ref-count of the adopted pointer.
other.m_objPtr = nullptr;
other.m_deleter = nullptr;
DeleteReferenceIfNeeded(oldPtr, oldDeleter);
return *this;
}
// Conversion to a ReferenceCountedSharedPtr instance of a base type
template <typename Base, typename std::enable_if<std::is_base_of<Base, T>::value>::type* = nullptr>
operator ReferenceCountedPtr<Base>()
{
return ReferenceCountedPtr<Base>(m_objPtr, m_deleter);
}
T* operator->() const
{
return m_objPtr;
}
T& operator*() const
{
return *m_objPtr;
}
operator T*() const
{
return m_objPtr;
}
T* GetPtr() const
{
return m_objPtr;
}
private:
void AddReferenceIfNeeded()
{
static_assert(std::is_base_of<ReferenceCount, T>::value, "ReferenceCountedPtr<T> can only be used when ReferenceCount is a base type of T!");
if (m_objPtr != nullptr)
reinterpret_cast<ReferenceCount*>(m_objPtr)->AddReference();
}
static void DeleteReferenceIfNeeded(T* objPtr, ReferenceCountedObjectDeleter deleter)
{
static_assert(std::is_base_of<ReferenceCount, T>::value, "ReferenceCountedPtr<T> can only be used when ReferenceCount is a base type of T!");
if (objPtr != nullptr)
{
size_t refCountRemaining = reinterpret_cast<ReferenceCount*>(objPtr)->RemoveReference();
if (refCountRemaining == 0)
{
if (deleter != nullptr)
deleter(reinterpret_cast<ReferenceCount*>(objPtr));
else
delete objPtr;
}
}
}
private:
T* m_objPtr;
ReferenceCountedObjectDeleter m_deleter;
};
template <typename T>
bool operator==(const ReferenceCountedPtr<T>& first, const ReferenceCountedPtr<T>& second)
{
return first.GetPtr() == second.GetPtr();
}
// A wrapper around the STL vector implementation with a safe ABI to allow usage across the library DLL boundary
// as STL vectors cannot be used across the DLL boundary
template <typename T>
class CNTK_API SimpleVector final
{
template <typename ValueType>
friend CNTK_API bool operator==(const SimpleVector<ValueType>& first, const SimpleVector<ValueType>& second);
friend class CNTK::Function;
public:
SimpleVector();
template <typename ContainerType, typename std::enable_if<std::is_same<ContainerType, std::vector<T>>::value ||
std::is_same<ContainerType, std::initializer_list<T>>::value ||
std::is_same<ContainerType, std::array<T, sizeof(ContainerType) / sizeof(T)>>::value>::type* = nullptr>
SimpleVector(const ContainerType& initList)
: SimpleVector(initList.size())
{
std::copy(initList.begin(), initList.end(), Data());
}
SimpleVector(size_t numElements, const T& initVal = T());
~SimpleVector();
SimpleVector(const SimpleVector& other);
SimpleVector& operator=(const SimpleVector& other);
SimpleVector(SimpleVector&& other);
SimpleVector& operator=(SimpleVector&& other);
T& operator[](size_t idx);
const T& operator[](size_t idx) const;
size_t Size() const;
T* Data();
const T* Data() const;
void PushBack(const T& value);
void PushBack(T&& value);
operator std::vector<T>() const
{
std::vector<T> retVector(Size());
for (size_t i = 0; i < Size(); ++i)
retVector[i] = this->operator[](i);
return retVector;
}
std::unordered_set<T> GetAsUnorderedSet(bool ensureUnique = true)
{
std::unordered_set<T> retSet;
for (size_t i = 0; i < Size(); ++i)
{
auto insertRet = retSet.insert(this->operator[](i));
if (ensureUnique && !insertRet.second)
RuntimeError("A SimpleVector with duplicate elements cannot be converted to an unordered_set");
}
return retSet;
}
private:
std::vector<T>* m_vector;
};
template <typename ValueType>
CNTK_API bool operator==(const SimpleVector<ValueType>& first, const SimpleVector<ValueType>& second);
template <typename ValueType>
bool operator!=(const SimpleVector<ValueType>& first, const SimpleVector<ValueType>& second)
{
return !(first == second);
}
// A wrapper around the STL set implementation with a safe ABI to allow usage across the library DLL boundary
// as STL sets cannot be used across the DLL boundary
template <typename KeyType>
class CNTK_API SimpleSet final
{
friend class CNTK::CompositeFunction;
template <typename T>
friend CNTK_API bool operator==(const SimpleSet<T>& first, const SimpleSet<T>& second);
public:
SimpleSet();
~SimpleSet();
SimpleSet(const SimpleSet& other);
SimpleSet& operator=(const SimpleSet& other);
SimpleSet(SimpleSet&& other);
SimpleSet& operator=(SimpleSet&& other);
bool Insert(const KeyType& key);
bool Contains(const KeyType& key) const;
size_t Size() const;
operator SimpleVector<KeyType>() const;
operator std::unordered_set<KeyType>() const
{
return ((SimpleVector<KeyType>)(*this)).GetAsUnorderedSet();
}
static SimpleSet<KeyType> CreateSimpleSet(const std::unordered_set<KeyType>& initSet)
{
SimpleSet<KeyType> simpleSet;
for (auto key : initSet)
simpleSet.Insert(key);
return simpleSet;
}
private:
std::unordered_set<KeyType>* m_set;
};
template <typename KeyType>
CNTK_API bool operator==(const SimpleSet<KeyType>& first, const SimpleSet<KeyType>& second);
template <typename KeyType>
bool operator!=(const SimpleSet<KeyType>& first, const SimpleSet<KeyType>& second)
{
return !(first == second);
}
// A wrapper aroound the STL map implementation with a safe ABI to allow usage across the library DLL boundary
// as STL maps cannot be used across the DLL boundary
template <typename KeyType, typename ValueType>
class CNTK_API SimpleMap final
{
friend class CNTK::CompositeFunction;
friend class CNTK::Function;
public:
SimpleMap();
~SimpleMap();
SimpleMap(const SimpleMap& other);
SimpleMap& operator=(const SimpleMap& other);
SimpleMap(SimpleMap&& other);
SimpleMap& operator=(SimpleMap&& other);
ValueType& operator[](const KeyType& key);
const ValueType& operator[](const KeyType& key) const;
bool Insert(const KeyType& key, const ValueType& value);
bool Contains(const KeyType& key) const;
size_t Size() const;
SimpleSet<KeyType> Keys() const;
static SimpleMap<KeyType, ValueType> CreateSimpleMap(const std::unordered_map<KeyType, ValueType>& initMap)
{
SimpleMap<KeyType, ValueType> simpleMap;
for (auto keyValuePair : initMap)
simpleMap.Insert(keyValuePair.first, keyValuePair.second);
return simpleMap;
}
private:
std::unordered_map<KeyType, ValueType>* m_map;
};
auto objPtr = new T(std::forward<CtorArgTypes>(ctorArgs)...);
return std::shared_ptr<T>(objPtr, [](T* ptr) { delete ptr; });
}
// Forward declarations
class NDArrayView;
typedef Internal::ReferenceCountedPtr<NDArrayView> NDArrayViewPtr;
typedef std::shared_ptr<NDArrayView> NDArrayViewPtr;
class NDMask;
typedef Internal::ReferenceCountedPtr<NDMask> NDMaskPtr;
typedef std::shared_ptr<NDMask> NDMaskPtr;
class Value;
typedef Internal::ReferenceCountedPtr<Value> ValuePtr;
typedef std::shared_ptr<Value> ValuePtr;
class Function;
typedef Internal::ReferenceCountedPtr<Function> FunctionPtr;
typedef std::shared_ptr<Function> FunctionPtr;
namespace Internal
{
CNTK_API FunctionPtr Combine(const Internal::SimpleVector<FunctionPtr>& operands, const std::wstring& name = L"");
}
}
namespace std {
template <typename T>
struct hash<CNTK::Internal::ReferenceCountedPtr<T>>
{
size_t operator()(const CNTK::Internal::ReferenceCountedPtr<T>& x) const
{
return std::hash<const void*>()(x.GetPtr());
}
};
class Learner;
typedef std::shared_ptr<Learner> LearnerPtr;
}

Просмотреть файл

@ -128,6 +128,7 @@
<ClInclude Include="API\CNTKLibrary.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
@ -140,6 +141,7 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="NDArrayView.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="stdafx.cpp">

Просмотреть файл

@ -10,6 +10,7 @@
<ClCompile Include="Variable.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="Learner.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@ -22,6 +23,7 @@
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

Просмотреть файл

@ -14,7 +14,21 @@ namespace CNTK
return GPUDevice(0);
}
/*static*/ Axis Axis::DefaultDynamicAxis = Axis(L"defaultDynamicAxis");
/*static*/ Axis Axis::BatchAxis = Axis(L"batchAxis");
/*static*/ Axis Axis::AllAxes = Axis(L"allAxes");
// The well-known axes are exposed as accessors over function-local statics:
// each Axis object is constructed on first use (rather than as a static data
// member), presumably to avoid static-initialization-order issues across
// translation units / the DLL boundary -- confirm against the library design.
/*static*/ const Axis& Axis::DefaultDynamicAxis()
{
static Axis s_defaultDynamicAxis(L"defaultDynamicAxis");
return s_defaultDynamicAxis;
}
// The implicit batch axis.
/*static*/ const Axis& Axis::BatchAxis()
{
static Axis s_batchAxis(L"batchAxis");
return s_batchAxis;
}
// Sentinel axis meaning "all axes".
/*static*/ const Axis& Axis::AllAxes()
{
static Axis s_allAxes(L"allAxes");
return s_allAxes;
}
}

Просмотреть файл

@ -17,34 +17,82 @@ bool g_shareNodeValueMatrices = true;
namespace CNTK
{
Internal::SimpleVector<Variable> Function::InputsImpl() const
std::shared_ptr<std::vector<Variable>> Function::InputsImpl() const
{
const CompositeFunction* compositeFunction = dynamic_cast<const CompositeFunction*>(this);
std::vector<Variable> inputs;
if (compositeFunction == nullptr)
return m_inputs;
inputs = m_inputs;
else
return Internal::SimpleVector<Variable>(compositeFunction->DetermineInputs());
inputs = compositeFunction->DetermineInputs();
return std::shared_ptr<std::vector<Variable>>(new std::vector<Variable>(std::move(inputs)), [](std::vector<Variable>* ptr) { delete ptr; });
}
/*virtual*/ void Function::_ReplacePlaceholders(const Internal::SimpleMap<Placeholder, Variable>& placeholderReplacements,
Internal::SimpleSet<const Function*>& visitedFunctions,
Internal::SimpleSet<Placeholder>& replacedPlaceholders)
FunctionPtr Function::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements)
{
visitedFunctions.Insert(this);
// Cannot be called on primitive functions
if (RootFunction() == nullptr)
InvalidArgument("ReplacePlaceholders should never be called on primitive functions");
for (auto& inputVar : *(m_inputs.m_vector))
std::unordered_set<const Function*> visitedFunctions;
std::unordered_set<Placeholder> replacedPlaceholders;
ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
for (auto replacementPair : placeholderReplacements)
{
if (replacedPlaceholders.find(replacementPair.first) == replacedPlaceholders.end())
InvalidArgument("At least one of the placeholders specified for replacement was not found in the function");
}
return this->shared_from_this();
}
// Placeholders can be replaced incrementally - i.e. not all placeholders need to replaced in one go.
// The only requirement is that they must all be replaced before making any 'Forward' calls on the Function instance.
/*virtual*/ void Function::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders)
{
visitedFunctions.insert(this);
for (auto& inputVar : m_inputs)
{
if (inputVar.IsPlaceholder())
{
Placeholder placeholder(inputVar);
if (placeholderReplacements.Contains(placeholder))
if (placeholderReplacements.find(placeholder) != placeholderReplacements.end())
{
inputVar = placeholderReplacements[placeholder];
replacedPlaceholders.Insert(placeholder);
inputVar = placeholderReplacements.at(placeholder);
replacedPlaceholders.insert(placeholder);
}
}
else if (inputVar.IsOutput() && !visitedFunctions.Contains(inputVar.Owner()))
inputVar.Owner()->_ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
else if (inputVar.IsOutput() && (visitedFunctions.find(inputVar.Owner().get()) == visitedFunctions.end()))
inputVar.Owner()->ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
}
}
// Replace any PlaceHolder Variables in the graph of Functions underlying 'this' CompositeFunction. All PlaceHolder variables
// should have been replaced before performing any Forward compute of 'this' Function.
/*virtual*/ void CompositeFunction::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders)
{
RootFunction()->ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
// If any of the placeholders were replaced with Output variables, let's add the graph of function underneath each of those to 'm_allPrimitiveFunctions' set
for (auto replacedPlaceholder : replacedPlaceholders)
{
auto replacingVariable = placeholderReplacements.at(replacedPlaceholder);
if (replacingVariable.IsOutput())
{
auto ownerFunc = replacingVariable.Owner();
std::unordered_set<FunctionPtr> visitedFunctions;
DetermineInputs(ownerFunc, visitedFunctions);
// Add the newly visited functions to 'm_allPrimitiveFunctions' set
m_allPrimitiveFunctions.insert(visitedFunctions.begin(), visitedFunctions.end());
}
}
}
@ -79,7 +127,7 @@ namespace CNTK
else if (variable.IsInput())
{
// TODO: Specify dynamic axis
if (variable.IsSparseInput())
if (IsSparseInput(variable))
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
else
computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape()));
@ -111,7 +159,7 @@ namespace CNTK
{
assert(variable.IsOutput());
Function* function = variable.Owner();
Function* function = variable.Owner().get();
ComputationNodeBasePtr computationNodePtr;
if (dynamic_cast<PrimitiveFunction*>(function))
{
@ -222,14 +270,14 @@ namespace CNTK
}
template <typename ElementType>
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device, const Internal::SimpleSet<Variable>& backpropRoots)
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device, const std::unordered_set<Variable>& backpropRoots)
{
if (m_computationNetwork != nullptr)
{
// TODO: We should either invalidate and readapt the network if he backpropRoots change compared to what was specified when the network
// was last constructed, to just recreate a new network.
// For now just disallow changing the backpropRoots after the network is created
if (m_currentBackpropRoots != *backpropRoots.m_set)
if (m_currentBackpropRoots != backpropRoots)
LogicError("Changing backprop roots across different Forward calls on a CNTK composite Function is currently unsupported");
// TODO: Support changing the device across different invocations of the forward method on a Function instance
@ -244,7 +292,7 @@ namespace CNTK
ComputationNetworkBuilder<ElementType> builder(*m_computationNetwork);
// TODO: We current only support one backprop root
if (backpropRoots.Size() > 1)
if (backpropRoots.size() > 1)
LogicError("More than one backprop roots is currently unsupported");
ComputationNodeBasePtr backpropRootNode;
@ -258,7 +306,7 @@ namespace CNTK
auto currentRootNode = GetNode(rootOutput, m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);
forwardRootNodes.push_back(currentRootNode);
if (backpropRoots.Contains(rootOutput))
if (backpropRoots.find(rootOutput) != backpropRoots.end())
backpropRootNode = m_variableToNodeMap[rootOutput];
}
@ -281,7 +329,7 @@ namespace CNTK
if (std::find(currentComputationNodeInputs.begin(), currentComputationNodeInputs.end(), nullptr) != currentComputationNodeInputs.end())
{
// We found a null input; this variable must correspond to a PastValue or FutureValue function
const PrimitiveFunction* primitiveFunc = dynamic_cast<const PrimitiveFunction*>(varNodePair.first.Owner().GetPtr());
const PrimitiveFunction* primitiveFunc = dynamic_cast<const PrimitiveFunction*>(varNodePair.first.Owner().get());
if ((primitiveFunc == nullptr) || ((primitiveFunc->OpType() != PrimitiveOpType::PastValue) && (primitiveFunc->OpType() != PrimitiveOpType::FutureValue)))
InvalidArgument("Invalid Function graph detected; recurrence found at a Function that is not a PastValue/FutureValue function");
@ -326,10 +374,10 @@ namespace CNTK
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(value->Data()->GetDataType()));
// TODO: Is supplying dense data for an Input variable tagged as sparse, a fatal error?
if (var.IsSparseInput() && !value->Data()->IsSparse())
if (IsSparseInput(var) && !value->Data()->IsSparse())
InvalidArgument("Dense input data supplied for a sparse input Variable");
if (var.IsSparseInput() && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
if (IsSparseInput(var) && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
InvalidArgument("Sparse Input data must be in SparseCSC format");
if (value->Data()->Shape().NumAxes() == var.Shape().NumAxes())
@ -413,7 +461,7 @@ namespace CNTK
layout->GetNumCols(),
AsCNTKImplDeviceId(value->Data()->Device()),
value->Data()->IsSparse() ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKMatrixFormat(value->Data()->GetStorageFormat()));
AsCNTKImplMatrixFormat(value->Data()->GetStorageFormat()));
std::vector<size_t> sequencesShorterThanLongestSequence;
for (size_t i = 0; i < numSequences; ++i)
@ -458,8 +506,8 @@ namespace CNTK
{
// Just create a view over the existing matrix itself
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorShape(valueDataShape));
auto data = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView), [](ReferenceCount* ptr) { delete ptr; });
return ValuePtr(new Value(data), [](ReferenceCount* ptr) { delete ptr; });
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView);
return MakeSharedObject<Value>(data);
}
if (layout->GetNumCols() != matrix.GetNumCols())
@ -509,7 +557,7 @@ namespace CNTK
NDMaskPtr mask;
if (!sequencesShorterThanLongestSequence.empty())
{
mask = NDMaskPtr(new NDMask({ maxNumTimeSteps, numSequences }, AsDeviceDescriptor(matrix.GetDeviceId())), [](ReferenceCount* ptr) { delete ptr; });
mask = MakeSharedObject<NDMask>(NDShape({ maxNumTimeSteps, numSequences }), AsDeviceDescriptor(matrix.GetDeviceId()));
for (auto shortSequenceIdx : sequencesShorterThanLongestSequence)
{
mask->MaskSection({ sequenceLengths[shortSequenceIdx], shortSequenceIdx }, { NDShape::InferredDimension, 1 });
@ -517,97 +565,89 @@ namespace CNTK
}
auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView), [](ReferenceCount* ptr) { delete ptr; });
return ValuePtr(new Value(data, mask), [](ReferenceCount* ptr) { delete ptr; });
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView);
return MakeSharedObject<Value>(data, mask);
}
void CompositeFunction::PopulateNetworkInputs(const Internal::SimpleMap<Variable, const ValuePtr>& arguments)
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableValue.first, variableValue.second);
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = computationNode->As<ComputationNode<ElementType>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
computationNode->GetMBLayout()->CopyFrom(layout);
}
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments)
{
auto functionArguments = this->Arguments();
std::vector<ComputationNodeBasePtr> inputNodes;
for (auto argument : functionArguments)
{
// Ensure we have values for all arguments of the function
if (!arguments.Contains(argument))
if (arguments.find(argument) == arguments.end())
InvalidArgument("Value not specified for required Function Argument");
auto argumentComputationNode = m_variableToNodeMap[argument];
inputNodes.push_back(argumentComputationNode);
ValuePtr argumentValue = arguments[argument];
ValuePtr argumentValue = arguments.at(argument);
MBLayoutPtr layout;
switch (argumentValue->Data()->GetDataType())
{
case DataType::Float:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(argument, argumentValue);
layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = argumentComputationNode->As<ComputationNode<float>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeValue<float>({ argument, argumentValue }, argumentComputationNode);
break;
}
case DataType::Double:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(argument, argumentValue);
layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = argumentComputationNode->As<ComputationNode<double>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode);
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(argumentValue->Data()->GetDataType()));
break;
}
argumentComputationNode->GetMBLayout()->CopyFrom(layout);
}
m_computationNetwork->BumpEvalTimeStamp(inputNodes);
}
void CompositeFunction::PopulateNetworkGradients(const Internal::SimpleMap<Variable, const ValuePtr>& gradients)
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode)
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableGradient.first, variableGradient.second);
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto nodeLayout = computationNode->GetMBLayout();
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
computationNode->As<ComputationNode<ElementType>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
}
// Assign the supplied gradients corresponding to the root(s) of the network to be backpropagated through the graph
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients)
{
auto functionOutputs = this->Outputs();
std::unordered_map<Variable, const ValuePtr>& gradientsValueMap = *gradients.m_map;
for (auto gradientVarValuePair : gradientsValueMap)
for (auto gradientVarValuePair : gradients)
{
// Only gradients for roots of the function can be specified
if (std::find(functionOutputs.begin(), functionOutputs.end(), gradientVarValuePair.first) == functionOutputs.end())
InvalidArgument("Gradients cannot be specified for a Variable that is not an Output of the Function");
auto outputComputationNode = m_variableToNodeMap[gradientVarValuePair.first];
auto nodeLayout = outputComputationNode->GetMBLayout();
ValuePtr gradientValue = gradientVarValuePair.second;
MBLayoutPtr layout;
switch (gradientValue->Data()->GetDataType())
{
case DataType::Float:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(gradientVarValuePair.first, gradientValue);
layout = CNTKMatrixAndMBLayout.second;
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
outputComputationNode->As<ComputationNode<float>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeGradient<float>(gradientVarValuePair, outputComputationNode);
break;
}
case DataType::Double:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(gradientVarValuePair.first, gradientValue);
layout = CNTKMatrixAndMBLayout.second;
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
outputComputationNode->As<ComputationNode<double>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientValue->Data()->GetDataType()));
break;
@ -618,6 +658,8 @@ namespace CNTK
static NDShape GetValueShape(const Variable& var, const ComputationNodeBasePtr& computationNodePtr)
{
size_t outputValueNumAxes = var.Shape().NumAxes();
// Add the batch and dynamic axes if needed
if (computationNodePtr->GetMBLayout() != nullptr)
outputValueNumAxes += 2;
@ -650,37 +692,27 @@ namespace CNTK
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}
ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
if (outputValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](ReferenceCount* ptr) { delete ptr; });
auto mask = (nodeValue->Mask() != nullptr) ? NDMaskPtr(new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()), [](ReferenceCount* ptr) { delete ptr; }) : nullptr;
outputValuePtr = ValuePtr(new Value(data, mask), [](ReferenceCount* ptr) { delete ptr; });
}
outputValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
}
case DataType::Double:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
if (outputValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](ReferenceCount* ptr) { delete ptr; });
auto mask = (nodeValue->Mask() != nullptr) ? NDMaskPtr(new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()), [](ReferenceCount* ptr) { delete ptr; }) : nullptr;
outputValuePtr = ValuePtr(new Value(data, mask), [](ReferenceCount* ptr) { delete ptr; });
}
outputValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}
if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
}
@ -713,50 +745,40 @@ namespace CNTK
if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");
ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
if (gradientValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](ReferenceCount* ptr) { delete ptr; });
auto mask = NDMaskPtr((nodeValue->Mask() != nullptr) ? new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr, [](ReferenceCount* ptr) { delete ptr; });
gradientValuePtr = ValuePtr(new Value(data, mask), [](ReferenceCount* ptr) { delete ptr; });
}
gradientValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
}
case DataType::Double:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
if (gradientValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](ReferenceCount* ptr) { delete ptr; });
auto mask = NDMaskPtr((nodeValue->Mask() != nullptr) ? new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr, [](ReferenceCount* ptr) { delete ptr; });
gradientValuePtr = ValuePtr(new Value(data, mask), [](ReferenceCount* ptr) { delete ptr; });
}
gradientValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}
if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}
gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
}
}
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const Internal::SimpleMap<Variable, const ValuePtr>& arguments,
Internal::SimpleMap<Variable, ValuePtr>& outputs,
const Internal::SimpleSet<Variable>& outputsToRetainBackwardStateFor,
const DeviceDescriptor& computeDevice)
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
{
// TODO: How about zero argument functions?
// TODO: We need a better way to determine the ElementType for the network
auto dataType = arguments.m_map->begin()->second->Data()->GetDataType();
auto dataType = arguments.begin()->second->Data()->GetDataType();
if (dataType == DataType::Float)
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor);
else
@ -767,10 +789,10 @@ namespace CNTK
// Feed data into the arguments of the network
PopulateNetworkInputs(arguments);
std::unordered_set<Variable> functionOutputs = Internal::SimpleVector<Variable>(this->Outputs()).GetAsUnorderedSet();
std::unordered_set<Variable> functionOutputs(this->Outputs().begin(), this->Outputs().end());
std::vector<ComputationNodeBasePtr> outputsToEvaluate;
for (auto outputVarValuePair : *outputs.m_map)
for (auto outputVarValuePair : outputs)
{
// Ensure that only a subset of this function's outputs are being asked to be evaluated
if (functionOutputs.find(outputVarValuePair.first) == functionOutputs.end())
@ -781,128 +803,105 @@ namespace CNTK
}
// The 'outputsToRetainBackwardStateFor' nodes also need to be evaluated if not already specified in 'outputs'
for (auto rootVarForBackprop : *outputsToRetainBackwardStateFor.m_set)
for (auto rootVarForBackprop : outputsToRetainBackwardStateFor)
{
if (outputs.m_map->find(rootVarForBackprop) == outputs.m_map->end())
if (outputs.find(rootVarForBackprop) == outputs.end())
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}
m_computationNetwork->ForwardProp(outputsToEvaluate);
GetNetworkOutputs(*(outputs.m_map));
GetNetworkOutputs(outputs);
// TODO: How to deal with the specified 'computeDevice'
return (outputsToRetainBackwardStateFor.Size() > 0) ? BackPropStatePtr(new CNTKBackPropState(this, { arguments.m_map->begin()->first, m_variableToNodeMap[arguments.m_map->begin()->first]->GetEvalTimeStamp() }), [](ReferenceCount* ptr) { delete ptr; }) : nullptr;
return (outputsToRetainBackwardStateFor.size() > 0) ? MakeSharedObject<CNTKBackPropState>(this->shared_from_this(), std::make_pair(arguments.begin()->first, m_variableToNodeMap[arguments.begin()->first]->GetEvalTimeStamp())) : nullptr;
}
/*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
const Internal::SimpleMap<Variable, const ValuePtr>& rootGradientValues,
Internal::SimpleMap<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
{
if ((state == nullptr) || (dynamic_cast<const CNTKBackPropState*>(state.GetPtr()) == nullptr))
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.get());
if (backpropState == nullptr)
InvalidArgument("Invalid backprop state specified");
// TODO: Support multiple concurrent backprop states
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.GetPtr());
if (backpropState->EvalTimeStamp().second != m_variableToNodeMap[backpropState->EvalTimeStamp().first]->GetEvalTimeStamp())
LogicError("The specified backprop state specified cannot be used for backpropagation as the Function's internal state was modified by subsequent Forward calls to the function."
"This is not a user error but a shortcoming of the current implementation where multiple independent backprop states are not simultaneously supported");
if (rootGradientValues.Size() > 1)
if (rootGradientValues.size() > 1)
LogicError("Currently gradient backprop from only one of the Function Outputs is supported");
// TODO: Avoid copying the data when possible
// Zero all gradients of nodes below the root nodes
for (auto rootGradientVarValuePair : *rootGradientValues.m_map)
for (auto rootGradientVarValuePair : rootGradientValues)
m_computationNetwork->ZeroInputGradients(m_variableToNodeMap[rootGradientVarValuePair.first]);
// Feed data into the arguments of the network
PopulateNetworkGradients(rootGradientValues);
// Backpropagate through the network
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.m_map->begin()->first];
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
GetNetworkGradients(*(backPropagatedGradientValuesForInputs.m_map));
GetNetworkGradients(backPropagatedGradientValuesForInputs);
// TODO: How to deal with the specified 'computeDevice'
}
/*virtual*/ void CompositeFunction::_ReplacePlaceholders(const Internal::SimpleMap<Placeholder, Variable>& placeholderReplacements, Internal::SimpleSet<const Function*>& visitedFunctions, Internal::SimpleSet<Placeholder>& replacedPlaceholders)
{
RootFunction()->_ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
// If any of the placeholders were replaced with Output variables, let's add the graph of function underneath each of those to 'm_allPrimitiveFunctions' set
for (auto replacedPlaceholder : *replacedPlaceholders.m_set)
{
auto replacingVariable = placeholderReplacements[replacedPlaceholder];
if (replacingVariable.IsOutput())
{
auto ownerFunc = replacingVariable.Owner();
Internal::SimpleSet<FunctionPtr> visitedFunctions;
DetermineInputs(ownerFunc, visitedFunctions);
// Add the newly visited functions to 'm_allPrimitiveFunctions' set
m_allPrimitiveFunctions.m_set->insert(visitedFunctions.m_set->begin(), visitedFunctions.m_set->end());
}
}
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Times, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Times, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Plus, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Plus, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Sigmoid, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Sigmoid, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr Tanh(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Tanh, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Tanh, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
namespace Internal
FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
FunctionPtr Combine(const Internal::SimpleVector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
std::unordered_set<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
for (auto operand : operands)
{
Internal::SimpleSet<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
for (size_t i = 0; i < operands.Size(); ++i)
{
if (uniqueOperands.Contains(operands[i]))
LogicError("All function operands specified to Combine must be unique");
if (uniqueOperands.find(operand) != uniqueOperands.end())
LogicError("All function operands specified to Combine must be unique");
uniqueOperands.Insert(operands[i]);
auto currentFunctionOutputs = operands[i]->Outputs();
std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
}
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
uniqueOperands.insert(operand);
auto currentFunctionOutputs = operand->Outputs();
std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
}
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
}
FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::CrossEntropyWithSoftmax, { output, labels }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::CrossEntropyWithSoftmax, std::vector<Variable>({ output, labels }), Dictionary(), name), name);
}
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ClassificationError, { prediction, labels }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, std::vector<Variable>({ prediction, labels }), Dictionary(), name), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Exp, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Exp, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
@ -912,7 +911,7 @@ namespace CNTK
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::PastValue, { initialState, operand }, std::move(additionalProperties), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::PastValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
@ -922,16 +921,16 @@ namespace CNTK
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::FutureValue, { initialState, operand }, std::move(additionalProperties), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::FutureValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ElementTimes, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ElementTimes, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ReduceSum, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ReduceSum, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
}

Просмотреть файл

@ -68,17 +68,17 @@ namespace CNTK
{
}
virtual BackPropStatePtr Forward(const Internal::SimpleMap<Variable, const ValuePtr>& /*arguments*/,
Internal::SimpleMap<Variable, ValuePtr>& /*outputs*/,
const Internal::SimpleSet<Variable>& /*outputsToRetainBackwardStateFor*/,
const DeviceDescriptor& /*computeDevice*/) override
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& /*arguments*/,
std::unordered_map<Variable, ValuePtr>& /*outputs*/,
const DeviceDescriptor& /*computeDevice*/,
const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
{
NOT_IMPLEMENTED;
}
virtual void Backward(const BackPropStatePtr& /*state*/,
const Internal::SimpleMap<Variable, const ValuePtr>& /*rootGradientValues*/,
Internal::SimpleMap<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
const std::unordered_map<Variable, const ValuePtr>& /*rootGradientValues*/,
std::unordered_map<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
{
NOT_IMPLEMENTED;
}
@ -280,54 +280,58 @@ namespace CNTK
private:
std::pair<Variable, int64_t> m_evalTimeStamp;
};
typedef Internal::ReferenceCountedPtr<CNTKBackPropState> CNTKBackPropStatePtr;
typedef std::shared_ptr<CNTKBackPropState> CNTKBackPropStatePtr;
class CompositeFunction;
typedef Internal::ReferenceCountedPtr<CompositeFunction> CompositeFunctionPtr;
typedef std::shared_ptr<CompositeFunction> CompositeFunctionPtr;
class CompositeFunction final : public Function
{
friend class Function;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
Internal::SimpleSet<FunctionPtr> visitedFunctions;
std::unordered_set<FunctionPtr> visitedFunctions;
// Call DetermineInputs to get the set of all functions in the graph
DetermineInputs(rootFunction, visitedFunctions);
auto func = new CompositeFunction(rootFunction, std::move(visitedFunctions), name);
return CompositeFunctionPtr(func, [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<CompositeFunction>(rootFunction, std::move(visitedFunctions), name);
}
virtual BackPropStatePtr Forward(const Internal::SimpleMap<Variable, const ValuePtr>& arguments,
Internal::SimpleMap<Variable, ValuePtr>& outputs,
const Internal::SimpleSet<Variable>& outputsToRetainBackwardStateFor,
const DeviceDescriptor& computeDevice) override;
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor) override;
virtual void Backward(const BackPropStatePtr& state,
const Internal::SimpleMap<Variable, const ValuePtr>& rootGradientValues,
Internal::SimpleMap<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
private:
virtual void _ReplacePlaceholders(const Internal::SimpleMap<Placeholder, Variable>& placeholderReplacements, Internal::SimpleSet<const Function*>& visitedFunctions, Internal::SimpleSet<Placeholder>& replacedPlaceholders) override;
virtual void ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders) override;
CompositeFunction(const FunctionPtr& rootFunction, Internal::SimpleSet<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
: Function({}, rootFunction->Outputs(), rootFunction, name), m_allPrimitiveFunctions(std::move(allPrimitiveFunctions))
{
}
std::vector<Variable> DetermineInputs() const
{
Internal::SimpleSet<FunctionPtr> visitedFunctions;
std::unordered_set<FunctionPtr> visitedFunctions;
return DetermineInputs(RootFunction(), visitedFunctions);
}
// Recursively traverses the Function graph underlying the 'rootFunction' to determine all the leaves (aka inputs) of the graph
static std::vector<Variable> DetermineInputs(const FunctionPtr& rootFunction, Internal::SimpleSet<FunctionPtr>& visitedFunctions)
static std::vector<Variable> DetermineInputs(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>& visitedFunctions)
{
visitedFunctions.Insert(rootFunction);
visitedFunctions.insert(rootFunction);
std::vector<Variable> inputs;
std::vector<Variable> rootFunctionInputs = rootFunction->Inputs();
@ -335,7 +339,7 @@ namespace CNTK
{
if (!rootInput.IsOutput())
inputs.push_back(rootInput);
else if (!visitedFunctions.Contains(rootInput.Owner()))
else if (visitedFunctions.find(rootInput.Owner()) == visitedFunctions.end())
{
FunctionPtr function = rootInput.Owner();
std::vector<Variable> functionInputs = DetermineInputs(function, visitedFunctions);
@ -347,7 +351,7 @@ namespace CNTK
}
template <typename ElementType>
Microsoft::MSR::CNTK::ComputationNetworkPtr GetComputationNetwork(const DeviceDescriptor& device, const Internal::SimpleSet<Variable>& backpropRoots);
Microsoft::MSR::CNTK::ComputationNetworkPtr GetComputationNetwork(const DeviceDescriptor& device, const std::unordered_set<Variable>& backpropRoots);
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr GetOutputVariableNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap);
@ -355,8 +359,13 @@ namespace CNTK
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr GetNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap);
void PopulateNetworkInputs(const Internal::SimpleMap<Variable, const ValuePtr>& arguments);
void PopulateNetworkGradients(const Internal::SimpleMap<Variable, const ValuePtr>& gradients);
template <typename ElementType>
static void PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments);
template <typename ElementType>
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
@ -371,7 +380,7 @@ namespace CNTK
// Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive
// by holding strong references to them
Internal::SimpleSet<FunctionPtr> m_allPrimitiveFunctions;
std::unordered_set<FunctionPtr> m_allPrimitiveFunctions;
// A map from Variable objects to ComputationNode objects in the ComputationNetwork instance that implements 'this' Composite Function
std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr> m_variableToNodeMap;

Просмотреть файл

@ -0,0 +1,464 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"
// Dispatches a parameter update to the element-type-specific Update<ElementType>
// overload based on the data type of the smoothed gradient. Expects the names
// 'parameter', 'smoothedGradientValue', 'gradientValue', 'parameterValue' and
// 'trainingSampleCount' to be in scope at the expansion site.
#define UPDATE_FUNCTION \
    switch (smoothedGradientValue->Data()->GetDataType()) \
    { \
    case DataType::Float: \
        Update<float>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
        break; \
    case DataType::Double: \
        Update<double>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
        break; \
    default: \
        NOT_IMPLEMENTED; \
    }
using namespace Microsoft::MSR::CNTK;
using namespace std;
namespace CNTK
{
// Exposes NDArrayView's private read-only matrix accessor to LearnerBase
// subclasses (NDArrayView declares LearnerBase a friend).
template <typename ElementType>
/*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr arrayView)
{
    return arrayView->GetMatrix<ElementType>();
}
// Exposes NDArrayView's private writable matrix accessor to LearnerBase subclasses.
template <typename ElementType>
/*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(NDArrayViewPtr arrayView)
{
    return arrayView->GetWritableMatrix<ElementType>();
}
// Exposes NDArrayView's private read-only TensorView accessor to LearnerBase subclasses.
template <typename ElementType>
/*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr arrayView)
{
    return arrayView->GetTensorView<ElementType>();
}
// Debugging helper: reports whether the matrix payload of 'value' contains a NaN;
// 'name' is forwarded to Matrix::HasNan for diagnostic output.
/*static*/ bool LearnerBase::HasNan(const ValuePtr& value, const char* name)
{
    const auto& data = value->Data();
    const auto dataType = data->GetDataType();
    if (dataType == DataType::Float)
        return data->GetMatrix<float>()->HasNan(name);
    if (dataType == DataType::Double)
        return data->GetMatrix<double>()->HasNan(name);
    LogicError("Unsupported DataType %s", DataTypeName(dataType));
}
// Debugging helper: prints the matrix payload of 'value', labeled with 'msg'.
/*static*/ void LearnerBase::Print(const ValuePtr& value, const char* msg)
{
    const auto& data = value->Data();
    const auto dataType = data->GetDataType();
    if (dataType == DataType::Float)
        data->GetMatrix<float>()->Print(msg);
    else if (dataType == DataType::Double)
        data->GetMatrix<double>()->Print(msg);
    else
        LogicError("Unsupported DataType %s", DataTypeName(dataType));
}
// Clips gradients to prevent outliers. Clipping is DISABLED only when the
// per-sample threshold is exactly +infinity; any finite threshold is active.
// Two modes: truncation (element-wise clamp) or Frobenius-norm rescaling.
template <typename ElementType>
void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
{
    if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
    {
        // Threshold is per sample; scale by the minibatch size to get the per-MB bound.
        double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
        if (m_additionalOptions.gradientClippingWithTruncation)
            gradient.InplaceTruncate(ElementType(maxGradientPerMB));
        else
        {
            // norm2 normalized: rescale the whole gradient so its Frobenius norm
            // does not exceed the per-MB bound.
            double gradientNorm = gradient.FrobeniusNorm();
            if (gradientNorm > maxGradientPerMB)
            {
                double normFactor = maxGradientPerMB / gradientNorm;
                gradient *= ElementType(normFactor);
            }
        }
    }
}
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
// Mutates the gradient in place; the parameter value is read (and added into the
// gradient) only when L2 regularization is enabled.
template <typename ElementType>
void LearnerBase::PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const
{
    const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();

    // clipping gradients to prevent outliers
    ClipGradient<ElementType>(*gradientMatrix, actualMBSize);

    // L2 regularizer
    if (m_additionalOptions.l2RegularizationWeight > 0)
    {
        // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
        auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
        const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
        Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
    }
}
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
// Both steps mutate the parameter value in place.
template <typename ElementType>
void LearnerBase::PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
                              const ValuePtr& parameterValue, size_t actualMBSize) const
{
    const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
    if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
    {
        const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();

        Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());

        // get the gradient structure since gradient is sparse
        sgdUpdateNoise.SetValue(*gradientMatrix);

        auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);

        // reset its value to random
        sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);

        Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
    }

    // L1 regularizer with proximal gradient descent method
    if (m_additionalOptions.l1RegularizationWeight > 0)
    {
        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
        // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
        auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
        parameterValue->Data()->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
    }
}
// Exposes NDArrayView's private writable TensorView accessor to LearnerBase subclasses.
template <typename ElementType>
/*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(NDArrayViewPtr arrayView)
{
    return arrayView->GetWritableTensorView<ElementType>();
}
// Constructs the state shared by all standard learners: for every parameter,
// allocates a zero-initialized smoothed-gradient buffer matching the parameter's
// data type and shape, and installs a default learning-rate multiplier of 1.0.
// (Fix: dropped the redundant 'parameterSet' alias of 'parameters' and iterates
// the argument directly.)
LearnerBase::LearnerBase(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
    : Learner(parameters),
    m_learningRatePerSample(0.0),
    m_sampleCount(0)
{
    for (const auto& parameter : parameters)
    {
        // TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
        // Should the device be specified on the per-parameter basis?
        NDArrayViewPtr view;
        if (parameter.GetDataType() == DataType::Float)
        {
            view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), device);
        }
        else
        {
            view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), device);
        }

        m_smoothedGradientValues.insert(make_pair(parameter, MakeSharedObject<Value>(view)));
        m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
    }
}
// Zeroes every smoothed-gradient buffer; used (per the header) to reset SGD
// momentum, e.g. after BlockMomentumSGD aggregation.
void LearnerBase::ResetSmoothedGradients()
{
    for (const auto& parameter : Parameters())
    {
        const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
        const auto& data = smoothedGradientValue->Data();
        switch (data->GetDataType())
        {
        case DataType::Float:
            data->SetValue(0.0f);
            break;
        case DataType::Double:
            data->SetValue(0.0);
            break;
        default:
            LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
        }
    }
}
// Applies one update step to every parameter of this learner, dispatching per
// element type via UPDATE_FUNCTION to the subclass's Update<ElementType>.
// In _DEBUG builds, checks smoothed gradients and updated parameters for NaNs;
// with DUMPOUTPUT, logs intermediate values. Accumulates the processed sample
// count and always returns false (no distributed-update signal here).
/*virtual*/ bool LearnerBase::Update(const unordered_map<Variable, ValuePtr>& parameterValues,
                                     const unordered_map<Variable, const ValuePtr>& gradientValues,
                                     size_t trainingSampleCount) /*override*/
{
    // make sure trainingSampleCount is a valid value
    assert(trainingSampleCount > 0);

    for (const auto& parameter : Parameters())
    {
        const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
        const auto& gradientValue = gradientValues.at(parameter);
        const auto& parameterValue = parameterValues.at(parameter);

        // TODO: make this a runtime parameter.
#if DUMPOUTPUT
        LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
#endif

#ifdef _DEBUG
        if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
            LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
#endif

#if DUMPOUTPUT
        LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
                  m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
        LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
                  LearnerType().c_str(), m_GaussianNoiseInjectStd);
        Print(gradientValue, "Gradient Update");
        Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
        UPDATE_FUNCTION;

#if DUMPOUTPUT
        Print(parameterValue, "Parameter Update");
#endif

#ifdef _DEBUG
        if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
            LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
#endif
    }
    m_sampleCount += trainingSampleCount;
    return false;
}
// Templatized update pipeline: wraps the subclass's virtual Update with the
// shared pre-/post-processing (clipping + L2 before, noise + L1 after).
template <typename ElementType>
void LearnerBase::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                         const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
    PreProcess<ElementType>(gradientValue, parameterValue, trainingSampleCount);
    Update(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount);
    PostProcess<ElementType>(parameter, gradientValue, parameterValue, trainingSampleCount);
}
// Returns the dynamic type name of this learner, stripping the "class " prefix
// that MSVC's typeid(...).name() prepends on Windows.
string LearnerBase::LearnerType() const
{
    const string typeName = typeid(*this).name();
    const string msvcPrefix = "class ";
    if (typeName.compare(0, msvcPrefix.size(), msvcPrefix) == 0)
    {
        // On Windows, drop the "class " prefix and return the bare type name.
        return typeName.substr(msvcPrefix.size());
    }
    return typeName;
}
// Serializes the smoothed gradients into a Dictionary keyed by parameter name.
// Currently disabled: the leading NOT_IMPLEMENTED aborts before any of the
// code below runs (kept as a sketch of the intended implementation).
/*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
{
    NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
    Dictionary checkpoint;

    for (const auto& parameter : Parameters())
    {
        // TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
        // need to expose "UId" property -- a persistent unique internal name.
        // Switch to UId as soon as it's available.
        if (checkpoint.Contains(parameter.Name()))
        {
            LogicError("Parameter names must be unique");
        }

        const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);

        // Potentially, could store things like dimensions, element size, format, etc., but
        // that seems to be redundant, since all of that is passed in the constructor.
        checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue->Data());
    }
    return checkpoint;
}
// Restores smoothed gradients from a checkpoint Dictionary keyed by parameter
// name. Currently disabled: the leading NOT_IMPLEMENTED aborts before any of
// the code below runs (kept as a sketch of the intended implementation).
/*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
{
    NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
    for (const auto& parameter : Parameters())
    {
        if (!checkpoint.Contains(parameter.Name()))
        {
            LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
        }

        const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
        const DictionaryValue& state = checkpoint[parameter.Name()];

        const auto& data = smoothedGradientValue->Data();

        DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
    }
}
// Virtual dispatch shim: forwards to LearnerSGD::Update<float|double> based on
// the smoothed gradient's element type.
/*virtual*/ void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                                    const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
    UPDATE_FUNCTION;
}
// Vanilla/momentum/Nesterov SGD step: delegates to Matrix::NormalGrad, which
// updates both the smoothed gradient (momentum accumulator) and the parameter.
template <typename ElementType>
void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
    UNUSED(trainingSampleCount);

    const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
    const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
    const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());

    const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));

    // TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
    // (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
    smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
                                       learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
}
// AdaGrad learner; 'needAveMultiplier' is forwarded to Matrix::Adagrad on each update.
LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
    : LearnerBase(parameters, device),
    m_needAveMultiplier(needAveMultiplier)
{
}
// Virtual dispatch shim: forwards to LearnerAdaGrad::Update<float|double>.
/*virtual*/ void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
    UPDATE_FUNCTION;
}
// AdaGrad step: Matrix::Adagrad scales the gradient by accumulated squared
// gradients (stored in the smoothed gradient) and returns an averaging
// multiplier used to normalize the learning rate.
template <typename ElementType>
void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
    UNUSED(trainingSampleCount);

    const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
    const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
    const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());

    auto learningRate = ElementType(ParameterDependentLearningRate(parameter));

    auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
    Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
// FSAdaGrad learner; inherits the momentum setting from LearnerMomentumSGD.
LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
    : LearnerMomentumSGD(parameters, device)
{
}
// Virtual dispatch shim: forwards to LearnerFSAdaGrad::Update<float|double>.
/*virtual*/ void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                                          const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
    UPDATE_FUNCTION;
}
// FSAdaGrad step: delegates to Matrix::FSAdagrad, which combines
// momentum-smoothed gradients with AdaGrad-style normalization.
template <typename ElementType>
void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                              const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
    UNUSED(trainingSampleCount);

    const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
    const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
    const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());

    //const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);

    auto learningRate = ElementType(ParameterDependentLearningRate(parameter));

    smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
                                      learningRate, ElementType(m_momentumPerSample));
}
// RMSProp learner; gamma/inc/dec/max/min are forwarded verbatim to
// Matrix::RmsProp on each update.
LearnerRMSProp::LearnerRMSProp(const unordered_set<Variable>& parameters,
                               double gamma, double inc, double dec, double max, double min,
                               bool needAveMultiplier, const DeviceDescriptor& device)
    : LearnerBase(parameters, device),
    m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
    m_needAveMultiplier(needAveMultiplier)
{
}
// Virtual dispatch shim: forwards to LearnerRMSProp::Update<float|double>.
/*virtual*/ void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
    UPDATE_FUNCTION;
}
// RMSProp step: Matrix::RmsProp rescales the gradient by a running
// root-mean-square of past gradients (state kept in the smoothed gradient)
// and returns an averaging multiplier used to normalize the learning rate.
template <typename ElementType>
void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                            const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
    UNUSED(trainingSampleCount);

    const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
    const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
    const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());

    auto learningRate = ElementType(ParameterDependentLearningRate(parameter));

    auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
                                                         ElementType(m_gamma), ElementType(m_inc),
                                                         ElementType(m_max), ElementType(m_dec),
                                                         ElementType(m_min), m_needAveMultiplier);
    Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
// Explicit template instantiations
template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr arrayView);
template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr arrayView);

// Public factory functions: each constructs the corresponding concrete learner
// via MakeSharedObject and returns it behind the opaque LearnerPtr handle.

LearnerPtr SGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
    return MakeSharedObject<LearnerSGD>(parameters, device);
}

LearnerPtr MomentumSGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
    return MakeSharedObject<LearnerMomentumSGD>(parameters, device);
}

LearnerPtr NesterovLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
    return MakeSharedObject<LearnerNesterov>(parameters, device);
}

LearnerPtr AdaGradLearner(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
{
    return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier, device);
}

LearnerPtr FSAdaGradLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
    return MakeSharedObject<LearnerFSAdaGrad>(parameters, device);
}

LearnerPtr RMSPropLearner(const unordered_set<Variable>& parameters,
                          double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
                          const DeviceDescriptor& device)
{
    return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier, device);
}
}

Просмотреть файл

@ -0,0 +1,224 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"

#include <limits>
namespace CNTK
{
// A collection of additional options that are applicable for all standard learners
// (after these options are set, they retain their value for the entire lifespan of a learner).
struct AdditionalLearningOptions
{
    double l1RegularizationWeight = 0.0;
    double l2RegularizationWeight = 0.0;
    double gaussianNoiseInjectionStdDev = 0.0;
    bool gradientClippingWithTruncation = false;
    // Infinity means "clipping disabled": LearnerBase::ClipGradient treats any
    // finite value as an active per-sample threshold, so the previous default
    // of 0.0 clipped every gradient to zero.
    double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
    // Per-parameter scale applied to the global learning rate
    // (see LearnerBase::ParameterDependentLearningRate).
    std::unordered_map<Variable, double> learningRateMultipliers;
};
// An abstract base class at the root of the standard learners hierarchy
// It implements most of the learner functionality, except for the actual update function,
// and adds a few pre-/postprocessing methods (which are invoked before and after the update).
class LearnerBase : public Learner
{
public:
    // Applies one update step per parameter; dispatches to the subclass's
    // type-specific Update and returns false.
    CNTK_API virtual bool Update(const std::unordered_map<Variable, ValuePtr>& parameterValues,
                                 const std::unordered_map<Variable, const ValuePtr>& gradientValues,
                                 size_t trainingSampleCount) override final;

    // Checkpointing entry points (currently stubbed with NOT_IMPLEMENTED in the .cpp).
    CNTK_API virtual Dictionary GetCheckpointState() const override;

    CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override;

    // Replaces all additional options wholesale; options then persist for the learner's lifetime.
    CNTK_API void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
    {
        m_additionalOptions = additionalOptions;
    }

    // TODO: should this be called ResetMomentum?
    // needed for BlockMomemtumSGD to reset SGD momentum after aggregation.
    CNTK_API void ResetSmoothedGradients();

    // TODO: move learning rate and momentum scheduling and adjustment functionality
    // inside the learner and drop these setters.
    void SetLearningRate(double value) { m_learningRatePerSample = value; }

protected:
    LearnerBase(const std::unordered_set<Variable>& parameters,
                const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());

    // Type-specific update implemented by each concrete learner.
    virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const = 0;

    // Global per-sample learning rate scaled by this parameter's multiplier.
    double ParameterDependentLearningRate(const Variable& parameter) const
    {
        return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
    }

    std::string LearnerType() const;

    double m_learningRatePerSample;

    AdditionalLearningOptions m_additionalOptions;

    // Per-parameter optimizer state (momentum / accumulated statistics).
    std::unordered_map<Variable, ValuePtr> m_smoothedGradientValues;

    // The following four static protected methods expose private methods of NDArrayView class
    // (which declares LearnerBase as friend class), so that they are available to subclasses.
    template <typename ElementType>
    static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr arrayView);

    template <typename ElementType>
    static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(NDArrayViewPtr arrayView);

    template <typename ElementType>
    static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr arrayView);

    template <typename ElementType>
    static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(NDArrayViewPtr arrayView);

    template <typename ElementType>
    void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;

    // Performs additional preprocessing before calling the update method
    // (gradient clipping and L2 regularization depending on the additional learning parameters).
    template <typename ElementType>
    void PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const;

    // Performs additional postprocessing after the update method has been executed
    // (noise injection and L1 regularization specified by the additional learning parameters).
    template <typename ElementType>
    void PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
                     const ValuePtr& parameterValue, size_t actualMBSize) const;

private:
    // Templatized update function, it invokes preprocess and postprocess using the provided
    // template parameter and also invokes virtual Update method implemented in one of the subclasses.
    template <typename ElementType>
    void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;

    // TODO: make these functions friends of NDViewArray and move to Utils?
    static bool HasNan(const ValuePtr& value, const char* name);
    static void Print(const ValuePtr& value, const char* msg);

    // Total number of training samples processed by Update so far.
    size_t m_sampleCount;
};
// Vanilla gradient descent optimization algorithm.
class LearnerSGD : public LearnerBase
{
public:
    // Plain SGD: zero momentum, no Nesterov acceleration. Subclasses flip these
    // members to obtain momentum SGD and NAG.
    LearnerSGD(const std::unordered_set<Variable>& parameters,
               const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
        : LearnerBase(parameters, device),
        m_momentumPerSample(0.0),
        m_useNesterovAcceleration(false)
    {
    }

protected:
    virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;

    template <typename ElementType>
    void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;

    double m_momentumPerSample;
    bool m_useNesterovAcceleration;
};
// SGD optimization with momentum.
class LearnerMomentumSGD : public LearnerSGD
{
public:
    // Same update rule as LearnerSGD; differs only in exposing a momentum setter.
    LearnerMomentumSGD(const std::unordered_set<Variable>& parameters,
                       const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
        : LearnerSGD(parameters, device)
    {
    }

    void SetMomentum(double value) { m_momentumPerSample = value; }
};
// Nesterov's accelerated SGDLearnerBase descent.
class LearnerNesterov : public LearnerSGD
{
public:
    // Nesterov-accelerated gradient: identical to LearnerSGD except that the
    // Nesterov flag is enabled, which NormalGrad honors in the update.
    LearnerNesterov(const std::unordered_set<Variable>& parameters,
                    const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
        : LearnerSGD(parameters, device)
    {
        m_useNesterovAcceleration = true;
    }
};
// AdaGrad learner: per-element adaptive learning rates via Matrix::Adagrad.
class LearnerAdaGrad : public LearnerBase
{
public:
    LearnerAdaGrad(const std::unordered_set<Variable>& parameters, bool needAveMultiplier,
                   const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());

protected:
    // Forwarded to Matrix::Adagrad; controls the averaging multiplier used to
    // normalize the learning rate.
    bool m_needAveMultiplier;

    virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;

    template <typename ElementType>
    void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
// FSAdaGrad learner: combines momentum (inherited from LearnerMomentumSGD)
// with AdaGrad-style normalization via Matrix::FSAdagrad.
class LearnerFSAdaGrad : public LearnerMomentumSGD
{
public:
    LearnerFSAdaGrad(const std::unordered_set<Variable>& parameters,
                     const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());

protected:
    virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;

    template <typename ElementType>
    void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
// RMSProp learner: the five tuning constants are forwarded verbatim to
// Matrix::RmsProp on every update.
class LearnerRMSProp : public LearnerBase
{
public:
    LearnerRMSProp(const std::unordered_set<Variable>& parameters,
                   double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
                   const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());

protected:
    double m_gamma; // smoothing factor for the running gradient statistics
    double m_inc;   // step-size increase factor
    double m_dec;   // step-size decrease factor
    double m_max;   // upper bound on the per-element scale
    double m_min;   // lower bound on the per-element scale
    bool m_needAveMultiplier;

    virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                        const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;

    template <typename ElementType>
    void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
                const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
}

Просмотреть файл

@ -60,7 +60,7 @@ namespace CNTK
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKMatrixFormat(storageType));
AsCNTKImplMatrixFormat(storageType));
return new TensorView<ElementType>(matrix, AsTensorShape(viewShape));
}
@ -99,8 +99,22 @@ namespace CNTK
}
NDArrayView::NDArrayView(CNTK::DataType dataType, const DeviceDescriptor& device, CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView)
: m_dataType(dataType), m_device(device), m_storageFormat(storageType), m_viewShape(viewShape), m_isReadOnly(readOnly), m_tensorView(tensorView)
: m_dataType(dataType), m_device(device), m_storageFormat(storageType), m_viewShape(viewShape), m_isReadOnly(readOnly)
{
m_tensorView = std::shared_ptr<void>(tensorView, [this](void*) {
switch (m_dataType)
{
case DataType::Float:
delete GetTensorView<float>();
break;
case DataType::Double:
delete GetTensorView<double>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
});
}
NDArrayView::NDArrayView(CNTK::DataType dataType, CNTK::StorageFormat storageType, const NDShape& viewShape, const DeviceDescriptor& device)
@ -108,6 +122,10 @@ namespace CNTK
{
}
NDArrayView::~NDArrayView()
{
}
void NDArrayView::SetValue(float value)
{
if (IsSparse())
@ -124,22 +142,6 @@ namespace CNTK
GetWritableMatrix<double>()->SetValue(value);
}
NDArrayView::~NDArrayView()
{
switch (m_dataType)
{
case DataType::Float:
delete GetTensorView<float>();
break;
case DataType::Double:
delete GetTensorView<double>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
}
template <typename ElementType>
/*static*/ std::shared_ptr<Matrix<ElementType>> NDArrayView::GetMatrixImpl(const TensorView<ElementType>* tensorView, size_t rowColSplitPoint)
{
@ -150,7 +152,8 @@ namespace CNTK
size_t splitPoint = rowColSplitPoint;
if (splitPoint == NDArrayView::AutoSelectRowColSplitPoint)
{
// Determine the split point
// Determine the split point by determining which of the axes can be
// folded and selecting the non-foldable axis as the split point
std::vector<bool> dimsToDrop(tensorShape.GetRank(), false);
for (size_t k = 1; k < tensorShape.GetRank(); ++k)
if (tensorShape.CanFlatten(k))
@ -197,7 +200,7 @@ namespace CNTK
if (AsDataType<ElementType>() != m_dataType)
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
return (const TensorView<ElementType>*)(m_tensorView);
return (const TensorView<ElementType>*)(m_tensorView.get());
}
template <typename ElementType>
@ -211,7 +214,7 @@ namespace CNTK
NDArrayViewPtr NDArrayView::DeepClone(bool readOnly/* = false*/) const
{
NDArrayViewPtr newView(new NDArrayView(this->GetDataType(), this->GetStorageFormat(), this->Shape(), this->Device()), [](ReferenceCount* ptr) { delete ptr; });
NDArrayViewPtr newView = MakeSharedObject<NDArrayView>(this->GetDataType(), this->GetStorageFormat(), this->Shape(), this->Device());
switch (m_dataType)
{
case DataType::Float:
@ -234,9 +237,7 @@ namespace CNTK
}
newView->m_isReadOnly = readOnly;
return NDArrayViewPtr(newView, [](ReferenceCount* ptr) {
delete ptr;
});
return newView;
}
void NDArrayView::CopyFrom(const NDArrayView& source)
@ -285,8 +286,7 @@ namespace CNTK
break;
}
auto aliasView = new NDArrayView(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);;
return NDArrayViewPtr(aliasView, [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<NDArrayView>(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);
}
// TODO: This could actually be strided?
@ -322,8 +322,7 @@ namespace CNTK
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
auto tensorView = new TensorView<ElementType>(randomUniformMatrix, AsTensorShape(shape));
auto view = new NDArrayView(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
return NDArrayViewPtr(view, [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
// Explicit template instantiations
@ -339,8 +338,10 @@ namespace CNTK
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);

Просмотреть файл

@ -17,15 +17,13 @@ namespace CNTK
static Matrix<char>* AllocateMatrix(const NDShape& viewShape, const DeviceDescriptor& device)
{
auto matrixDims = GetMatrixDimensions(viewShape);
auto maskMatrix = new Matrix<char>(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device));
maskMatrix->SetValue(1);
return maskMatrix;
return new Matrix<char>(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device));
}
NDMask::NDMask(const NDShape& shape, Matrix<char>* matrix)
: m_device(AsDeviceDescriptor(matrix->GetDeviceId())), m_maskShape(shape), m_matrixView(matrix)
: m_device(AsDeviceDescriptor(matrix->GetDeviceId())), m_maskShape(shape)
{
m_matrixView = std::shared_ptr<Matrix<char>>(matrix, [](Matrix<char>* ptr) { delete ptr; });
}
NDMask::NDMask(const NDShape& shape, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
@ -33,16 +31,17 @@ namespace CNTK
{
if (shape.NumAxes() > 2)
LogicError("NDMask instances with more than 2 axes are currently unsupported");
Clear();
}
NDMask::~NDMask()
{
delete m_matrixView;
}
void NDMask::MaskSection(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape)
{
// TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
// TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
// GPU invocations for each MaskSection call.
if (sectionOffset.size() > m_maskShape.NumAxes())
@ -78,12 +77,13 @@ namespace CNTK
void NDMask::Clear()
{
// Clear the mask by marking all samples as Valid; i.e. a value of 1
GetMatrix()->SetValue(1);
}
Matrix<char>* NDMask::GetMatrix() const
{
return m_matrixView;
return m_matrixView.get();
}
void NDMask::CopyFrom(const NDMask& source)
@ -96,14 +96,14 @@ namespace CNTK
NDMaskPtr NDMask::DeepClone() const
{
NDMaskPtr newMask = new NDMask(this->Shape(), this->Device());
NDMaskPtr newMask = MakeSharedObject<NDMask>(this->Shape(), this->Device());
newMask->CopyFrom(*this);
return NDMaskPtr(newMask, [](ReferenceCount* ptr) { delete ptr; });
return newMask;
}
NDMaskPtr NDMask::Alias() const
{
return NDMaskPtr(new NDMask(this->Shape(), new Matrix<char>(GetMatrix()->AsReference())), [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<NDMask>(this->Shape(), new Matrix<char>(GetMatrix()->AsReference()));
}
}

Просмотреть файл

@ -6,354 +6,138 @@
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "File.h"
using namespace std;
namespace CNTK
{
namespace Internal
template <typename T>
void DictionaryValue::AllocateDataPtr(const T& value)
{
ReferenceCount::ReferenceCount()
: m_rc(new std::atomic<size_t>(0))
{}
static_assert(is_same<T, NDShape>::value || is_same<T, vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
/*virtual*/ ReferenceCount::~ReferenceCount()
template <typename T>
void DictionaryValue::FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
void DictionaryValue::FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<vector<DictionaryValue>>();
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
{
size_t version;
stream >> version;
stream >> us.m_valueType;
switch (us.ValueType())
{
delete m_rc;
}
size_t ReferenceCount::AddReference()
case DictionaryValue::Type::Bool:
stream >> us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream >> us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream >> us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream >> us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
return ++(*m_rc);
}
size_t ReferenceCount::RemoveReference()
{
assert(m_rc->load() > 0);
return --(*m_rc);
}
size_t ReferenceCount::GetReferenceCount()
{
return m_rc->load();
}
#pragma region SimpleVector
template <typename T>
SimpleVector<T>::SimpleVector()
: m_vector(new std::vector<T>())
{
}
template <typename T>
SimpleVector<T>::SimpleVector(size_t numElements, const T& initVal/* = T()*/)
: m_vector(new std::vector<T>(numElements, initVal))
{
}
template <typename T>
SimpleVector<T>::~SimpleVector()
{
delete m_vector;
}
template <typename T>
SimpleVector<T>::SimpleVector(const SimpleVector<T>& other)
: m_vector(new std::vector<T>(*other.m_vector))
{
}
template <typename T>
SimpleVector<T>& SimpleVector<T>::operator=(const SimpleVector<T>& other)
{
if (this != &other)
size_t size;
stream >> size;
vector<size_t> dims(size);
for (auto i = 0; i < size; i++)
{
delete m_vector;
m_vector = new std::vector<T>(*other.m_vector);
stream >> dims[i];
}
return *this;
us.AllocateDataPtr(NDShape(dims));
break;
}
template <typename T>
SimpleVector<T>::SimpleVector(SimpleVector<T>&& other)
: m_vector(nullptr)
case DictionaryValue::Type::Vector:
{
*this = std::move(other);
}
template <typename T>
SimpleVector<T>& SimpleVector<T>::operator=(SimpleVector<T>&& other)
{
assert(this != &other);
delete m_vector;
m_vector = other.m_vector;
other.m_vector = nullptr;
return *this;
}
template <typename T>
T& SimpleVector<T>::operator[](size_t idx)
{
assert(idx < Size());
return (*m_vector)[idx];
}
template <typename T>
const T& SimpleVector<T>::operator[](size_t idx) const
{
assert(idx < Size());
return (*m_vector)[idx];
}
template <typename T>
size_t SimpleVector<T>::Size() const
{
return m_vector->size();
}
template <typename T>
T* SimpleVector<T>::Data()
{
return m_vector->data();
}
template <typename T>
const T* SimpleVector<T>::Data() const
{
return m_vector->data();
}
template <typename T>
void SimpleVector<T>::PushBack(const T& value)
{
m_vector->push_back(value);
}
template <typename T>
void SimpleVector<T>::PushBack(T&& value)
{
m_vector->push_back(std::move(value));
}
template <typename ValueType>
bool operator==(const SimpleVector<ValueType>& first, const SimpleVector<ValueType>& second)
{
return *first.m_vector == *second.m_vector;
}
// Explicit template instantiations
template class SimpleVector<Variable>;
template class SimpleVector<size_t>;
template class SimpleVector<Axis>;
template class SimpleVector<FunctionPtr>;
template bool operator==(const SimpleVector<size_t>& first, const SimpleVector<size_t>& second);
#pragma endregion SimpleVector
#pragma region SimpleSet
template <typename KeyType>
SimpleSet<KeyType>::SimpleSet()
: m_set(new std::unordered_set<KeyType>())
{
}
template <typename KeyType>
SimpleSet<KeyType>::~SimpleSet()
{
delete m_set;
}
template <typename KeyType>
SimpleSet<KeyType>::SimpleSet(const SimpleSet& other)
: m_set(nullptr)
{
*this = other;
}
template <typename KeyType>
SimpleSet<KeyType>& SimpleSet<KeyType>::operator=(const SimpleSet& other)
{
if (this != &other)
size_t size;
stream >> size;
vector<DictionaryValue> values(size);
for (auto i = 0; i < size; i++)
{
delete m_set;
m_set = new std::unordered_set<KeyType>(*(other.m_set));
stream >> values[i];
}
return *this;
us.AllocateDataPtr(values);
break;
}
template <typename KeyType>
SimpleSet<KeyType>::SimpleSet(SimpleSet&& other)
: m_set(nullptr)
{
*this = std::move(other);
default:
NOT_IMPLEMENTED;
}
return stream;
}
template <typename KeyType>
SimpleSet<KeyType>& SimpleSet<KeyType>::operator=(SimpleSet&& other)
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
{
stream << us.version;
stream << us.ValueType();
switch (us.ValueType())
{
assert(this != &other);
delete m_set;
m_set = other.m_set;
other.m_set = nullptr;
return *this;
}
template <typename KeyType>
bool SimpleSet<KeyType>::Insert(const KeyType& key)
case DictionaryValue::Type::Bool:
stream << us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream << us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream << us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream << us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
return m_set->insert(key).second;
}
template <typename KeyType>
bool SimpleSet<KeyType>::Contains(const KeyType& key) const
{
return (m_set->find(key) != m_set->end());
}
template <typename KeyType>
size_t SimpleSet<KeyType>::Size() const
{
return m_set->size();
}
template <typename KeyType>
SimpleSet<KeyType>::operator SimpleVector<KeyType>() const
{
SimpleVector<KeyType> retVector;
for (auto key : *m_set)
retVector.PushBack(key);
return retVector;
}
template <typename KeyType>
bool operator==(const SimpleSet<KeyType>& first, const SimpleSet<KeyType>& second)
{
return *first.m_set == *second.m_set;
}
// Explicit template instantiations
template class SimpleSet<FunctionPtr>;
template class SimpleSet<Variable>;
template class SimpleSet<Placeholder>;
template class SimpleSet<const Function*>;
template bool operator==(const SimpleSet<Variable>& first, const SimpleSet<Variable>& second);
template bool operator==(const SimpleSet<Placeholder>& first, const SimpleSet<Placeholder>& second);
#pragma endregion SimpleSet
#pragma region SimpleMap
template <typename KeyType, typename ValueType>
SimpleMap<KeyType, ValueType>::SimpleMap()
: m_map(new std::unordered_map<KeyType, ValueType>())
{
}
template <typename KeyType, typename ValueType>
SimpleMap<KeyType, ValueType>::~SimpleMap()
{
delete m_map;
}
template <typename KeyType, typename ValueType>
SimpleMap<KeyType, ValueType>::SimpleMap(const SimpleMap& other)
: m_map(nullptr)
{
*this = other;
}
template <typename KeyType, typename ValueType>
SimpleMap<KeyType, ValueType>& SimpleMap<KeyType, ValueType>::operator=(const SimpleMap& other)
{
if (this != &other)
NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
auto size = shapePtr->NumAxes();
stream << size;
for (auto i = 0; i < size; i++)
{
delete m_map;
m_map = new std::unordered_map<KeyType, ValueType>(*(other.m_map));
stream << shapePtr->operator[](i);
}
return *this;
break;
}
template <typename KeyType, typename ValueType>
SimpleMap<KeyType, ValueType>::SimpleMap(SimpleMap&& other)
: m_map(nullptr)
case DictionaryValue::Type::Vector:
{
*this = std::move(other);
vector<DictionaryValue>* vectorPtr =
reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
auto size = vectorPtr->size();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << vectorPtr->operator[](i);
}
break;
}
template <typename KeyType, typename ValueType>
SimpleMap<KeyType, ValueType>& SimpleMap<KeyType, ValueType>::operator=(SimpleMap&& other)
{
assert(this != &other);
delete m_map;
m_map = other.m_map;
other.m_map = nullptr;
return *this;
default:
NOT_IMPLEMENTED;
}
template <typename KeyType, typename ValueType>
ValueType& SimpleMap<KeyType, ValueType>::operator[](const KeyType& key)
{
return (*m_map)[key];
}
template <typename KeyType, typename ValueType>
const ValueType& SimpleMap<KeyType, ValueType>::operator[](const KeyType& key) const
{
return (*m_map)[key];
}
template <typename KeyType, typename ValueType>
bool SimpleMap<KeyType, ValueType>::Insert(const KeyType& key, const ValueType& value)
{
return m_map->insert({ key, value }).second;
}
template <typename KeyType, typename ValueType>
bool SimpleMap<KeyType, ValueType>::Contains(const KeyType& key) const
{
return (m_map->find(key) != m_map->end());
}
template <typename KeyType, typename ValueType>
size_t SimpleMap<KeyType, ValueType>::Size() const
{
return m_map->size();
}
template <typename KeyType, typename ValueType>
SimpleSet<KeyType> SimpleMap<KeyType, ValueType>::Keys() const
{
SimpleSet<KeyType> keys;
for (auto keyValuePair : *m_map)
keys.Insert(keyValuePair.first);
return keys;
}
// Explicit template instantiations
template class SimpleMap<Variable, ValuePtr>;
template class SimpleMap<Variable, const ValuePtr>;
template class SimpleMap<Placeholder, Variable>;
#pragma endregion SimpleMap
return stream;
}
Dictionary::Dictionary()
: m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
: m_dictionaryData(new unordered_map <wstring, DictionaryValue>)
{
}
@ -365,7 +149,7 @@ namespace CNTK
Dictionary::Dictionary(Dictionary&& other)
: m_dictionaryData(nullptr)
{
*this = std::move(other);
*this = move(other);
}
Dictionary& Dictionary::operator=(Dictionary&& other)
@ -394,4 +178,130 @@ namespace CNTK
{
return (m_dictionaryData->find(key) != m_dictionaryData->end());
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
{
stream << us.version;
stream << us.m_dictionaryData->size();
for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
{
stream << it->first;
stream << it->second;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
{
size_t version;
stream >> version;
size_t size;
stream >> size;
us.m_dictionaryData->reserve(size);
for (auto i = 0; i < size; i++)
{
wstring key;
stream >> key;
DictionaryValue value;
stream >> value;
us.m_dictionaryData->insert(make_pair(key, value));
}
return stream;
}
template <typename T>
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be serialized into a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
vector<DictionaryValue> values(numElements);
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
cpuDataViewPtr->CopyFrom(*viewPtr);
}
const T* buffer = cpuDataViewPtr->DataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
T v = buffer[i];
values[i] = DictionaryValue(v);
}
return values;
}
template <typename T>
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
if (values.size() != numElements)
{
LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
values.size(), numElements);
}
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
}
T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
buffer[i] = values[i].GetValue<T>();
}
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
viewPtr->CopyFrom(*cpuDataViewPtr);
}
}
// TODO: we store the type info for every element in the vector, which is extremely redundant.
// Instead, it'd be nice to introduce some sort of DictionaryValueVector.
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
return SerializeToVector<float>(viewPtr);
case DataType::Double:
return SerializeToVector<double>(viewPtr);
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
DeserializeFromVector<float>(viewPtr, values);
break;
case DataType::Double:
DeserializeFromVector<double>(viewPtr, values);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
}

Просмотреть файл

@ -15,245 +15,6 @@ namespace CNTK
// Forward declarations
class Dictionary;
class DictionaryValue
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Double,
NDShape,
Vector
};
static const char* TypeName(Type type)
{
if (type == Type::None)
return "None";
else if (type == Type::Bool)
return "Bool";
else if (type == Type::SizeT)
return "SizeT";
else if (type == Type::Double)
return "Double";
else if (type == Type::NDShape)
return "NDShape";
else if (type == Type::Vector)
return "Vector";
else
LogicError("Unknown DictionaryValue::Type");
}
public:
DictionaryValue()
: m_valueType(Type::None)
{
}
DictionaryValue(bool value)
: m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value)
: m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(double value)
: m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
template <typename T>
DictionaryValue(const T& value)
: m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, Internal::SimpleVector<DictionaryValue>>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other)
: m_valueType(Type::Bool)
{
// The m_valueType must hvae been set to a non-ptr type to prevent an attempt to interpret
// the underlying underlying uninitialized value as a ptr and free it.
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<Internal::SimpleVector<DictionaryValue>>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, Internal::SimpleVector<DictionaryValue>>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, double>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, Internal::SimpleVector<DictionaryValue>>::value ||
std::is_same<T, CNTK::Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value)
return Type::Bool;
else if (std::is_same<T, size_t>::value)
return Type::SizeT;
else if (std::is_same<T, double>::value)
return Type::Double;
else if (std::is_same<T, NDShape>::value)
return Type::NDShape;
else if (std::is_same<T, Internal::SimpleVector<DictionaryValue>>::value)
return Type::Vector;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
void AllocateDataPtr(const T& value)
{
static_assert(std::is_same<T, NDShape>::value || std::is_same<T, Internal::SimpleVector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
void FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<Internal::SimpleVector<DictionaryValue>>();
}
private:
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
double m_double;
void* m_ptr;
} m_data;
};
class Dictionary
{
public:
Dictionary();
~Dictionary();
// Disallow copy contruction and assignment
Dictionary(const Dictionary&) = delete; Dictionary& operator=(const Dictionary&) = delete;
Dictionary(Dictionary&& other);
Dictionary& operator=(Dictionary&& other);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
DictionaryValue& operator[](const wchar_t* key);
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
DictionaryValue operator[](const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
bool Contains(const wchar_t* key) const;
private:
std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
};
// Helper to get the size of an element of the specified DataType
inline size_t ElementSize(DataType dataType)
{
@ -275,7 +36,7 @@ namespace CNTK
NOT_IMPLEMENTED;
}
inline Microsoft::MSR::CNTK::MatrixFormat AsCNTKMatrixFormat(StorageFormat storageFormat)
inline Microsoft::MSR::CNTK::MatrixFormat AsCNTKImplMatrixFormat(StorageFormat storageFormat)
{
if (storageFormat == StorageFormat::Dense)
return Microsoft::MSR::CNTK::MatrixFormat::matrixFormatDense;
@ -358,4 +119,13 @@ namespace CNTK
return{ matrixRowSize, matrixColSize };
}
inline bool IsSparseInput(const Variable& var)
{
return var.IsInput() && var.IsSparse();
}
std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
}

Просмотреть файл

@ -21,15 +21,15 @@ namespace CNTK
auto maskShape = mask->Shape();
if (maskShape.NumAxes() > dataShape.NumAxes())
InvalidArgument("The number of axes of the mask of a Value object cannot exceed the number of axes of the data NDArrayView object");
InvalidArgument("The number of axes (%d) of the mask of a Value object cannot exceed the number of axes (%d) of the data NDArrayView object", (int)maskShape.NumAxes(), (int)dataShape.NumAxes());
if (dataShape.SubShape(dataShape.NumAxes() - maskShape.NumAxes()) != maskShape)
InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data do not match the dimensions of the mask");
InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data (%S) do not match the dimensions of the mask (%S)", dataShape.AsString().c_str(), maskShape.AsString().c_str());
}
}
template <typename T>
static NDMaskPtr CreateMask(size_t sampleSize, const std::vector<std::vector<T>>& sequences, const DeviceDescriptor& device)
static NDMaskPtr CreateMask(size_t numElementsPerSample, const std::vector<std::vector<T>>& sequences, const DeviceDescriptor& device)
{
size_t numSequences = sequences.size();
std::vector<size_t> sequenceLengths(numSequences);
@ -37,7 +37,7 @@ namespace CNTK
bool needsMask = false;
for (size_t i = 0; i < numSequences; ++i)
{
sequenceLengths[i] = sequences[i].size() / sampleSize;
sequenceLengths[i] = sequences[i].size() / numElementsPerSample;
if (maxSequenceLength < sequenceLengths[i])
maxSequenceLength = sequenceLengths[i];
@ -46,11 +46,12 @@ namespace CNTK
needsMask = true;
}
// If needed, create a mask to account for variability in lengths of specified sequences
NDMaskPtr deviceValueMask;
if (needsMask)
{
NDShape valueMaskShape = { maxSequenceLength, numSequences };
deviceValueMask = NDMaskPtr(new NDMask(valueMaskShape, device), [](Internal::ReferenceCount* ptr) {delete ptr; });
deviceValueMask = MakeSharedObject<NDMask>(valueMaskShape, device);
for (size_t i = 0; i < numSequences; ++i)
deviceValueMask->MaskSection({ sequenceLengths[i], i }, { NDShape::InferredDimension, 1 });
}
@ -87,23 +88,23 @@ namespace CNTK
}
colStarts[numSequences * maxSequenceLength] = (SparseIndexType)(nonZeroValues.size());
NDArrayViewPtr deviceValueData(new NDArrayView(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly), [](ReferenceCount* ptr) { delete ptr; });
return ValuePtr(new Value(deviceValueData, deviceValueMask), [](ReferenceCount* ptr) { delete ptr; });
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
template <typename ElementType>
/*static*/ ValuePtr Value::Create(const NDShape& sampleShape, const std::vector<std::vector<ElementType>>& sequences, const DeviceDescriptor& device, bool readOnly/* = false*/)
{
size_t sampleSize = sampleShape.TotalSize();
NDMaskPtr deviceValueMask = CreateMask(sampleSize, sequences, device);
size_t numElementsPerSample = sampleShape.TotalSize();
NDMaskPtr deviceValueMask = CreateMask(numElementsPerSample, sequences, device);
size_t maxSequenceLength = (deviceValueMask == nullptr) ? sequences[0].size() : deviceValueMask->Shape()[0];
size_t numSequences = sequences.size();
NDShape valueDataShape = sampleShape.AppendShape({ maxSequenceLength, numSequences });
NDArrayViewPtr valueData(new NDArrayView(AsDataType<ElementType>(), valueDataShape, DeviceDescriptor::CPUDevice()), [](ReferenceCount* ptr) { delete ptr; });
NDArrayViewPtr valueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, DeviceDescriptor::CPUDevice());
ElementType* dataBuffer = valueData->WritableDataBuffer<ElementType>();
for (size_t i = 0; i < numSequences; ++i)
std::copy(sequences[i].data(), sequences[i].data() + sequences[i].size(), dataBuffer + (maxSequenceLength * i * sampleSize));
std::copy(sequences[i].data(), sequences[i].data() + sequences[i].size(), dataBuffer + (maxSequenceLength * i * numElementsPerSample));
NDArrayViewPtr deviceValueData;
if (device == DeviceDescriptor::CPUDevice())
@ -115,13 +116,13 @@ namespace CNTK
}
else
{
deviceValueData = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), valueDataShape, device), [](ReferenceCount* ptr) { delete ptr; });
deviceValueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, device);
deviceValueData->CopyFrom(*valueData);
if (readOnly)
deviceValueData = deviceValueData->Alias(true);
}
return ValuePtr(new Value(deviceValueData, deviceValueMask), [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
/*virtual*/ Value::~Value()
@ -143,13 +144,13 @@ namespace CNTK
/*virtual*/ ValuePtr Value::DeepClone(bool readOnly/* = false*/) const
{
// TODO: Check if this is a derived type and throw an exception in that case
return ValuePtr(new Value(Data()->DeepClone(readOnly), (Mask() != nullptr) ? Mask()->DeepClone() : nullptr), [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<Value>(Data()->DeepClone(readOnly), (Mask() != nullptr) ? Mask()->DeepClone() : nullptr);
}
/*virtual*/ ValuePtr Value::Alias(bool readOnly/* = false*/) const
{
// TODO: Check if this is a derived type and throw an exception in that case
return ValuePtr(new Value(Data()->Alias(readOnly), (Mask() != nullptr) ? Mask()->Alias() : nullptr), [](ReferenceCount* ptr) { delete ptr; });
return MakeSharedObject<Value>(Data()->Alias(readOnly), (Mask() != nullptr) ? Mask()->Alias() : nullptr);
}
/*virtual*/ void Value::CopyFrom(const Value& source)

Просмотреть файл

@ -11,4 +11,9 @@ namespace CNTK
: Variable(function->Output())
{
}
// Returns the Function whose output this Variable is, wrapped as a shared_ptr.
// NOTE(review): assumes m_dataFields->m_ownerFunction is non-null (i.e. this
// Variable was produced by a Function) -- confirm against callers.
FunctionPtr Variable::Owner() const
{
    return m_dataFields->m_ownerFunction->shared_from_this();
}
}

Просмотреть файл

@ -106,22 +106,18 @@ public:
~BestGpu();
void Init();
void SetAllowedDevices(const std::vector<int>& devices); // only allow certain GPUs
bool DeviceAllowed(int device);
void DisallowDevice(int device)
{
assert((device >= -1) && (device <= 31));
if (device < 0)
m_disallowCPUDevice = true;
else
m_allowedDevices &= ~(1 << device);
}
bool DeviceAllowed(int deviceId);
void DisallowUnsupportedDevices();
void DisallowDevice(int deviceId);
void AllowAll(); // reset to allow all GPUs (no allowed list)
bool UseMultiple(); // using multiple GPUs?
int GetDevice(BestGpuFlags flags = bestGpuNormal); // get a single device
static const int AllDevices = -1; // can be used to specify all GPUs in GetDevices() call
static const int RequeryDevices = -2; // Requery refreshing statistics and picking the same number as last query
static const int MininumCCMajorForGpu = 3; // cntk supports GPUs with Compute Capability > 3.0
std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal); // get multiple devices
std::vector<ProcessorData *> GetProcessorData();
private:
bool LockDevice(int deviceId, bool trial = true);
};
@ -156,6 +152,8 @@ static DEVICEID_TYPE SelectDevice(DEVICEID_TYPE deviceId, bool bLockGPU, const i
{
g_bestGpu->DisallowDevice(excludedDevices[i]);
}
g_bestGpu->DisallowUnsupportedDevices();
}
bestDeviceId = (DEVICEID_TYPE)g_bestGpu->GetDevice(BestGpuFlags(bLockGPU ? (bestGpuAvoidSharing | bestGpuExclusiveLock) : bestGpuAvoidSharing));
@ -345,22 +343,32 @@ int BestGpu::GetDevice(BestGpuFlags bestFlags)
void BestGpu::SetAllowedDevices(const std::vector<int>& devices)
{
m_allowedDevices = 0;
for (int device : devices)
for (int deviceId : devices)
{
m_allowedDevices |= (1 << device);
m_allowedDevices |= (1 << deviceId);
}
}
// DeviceAllowed - is a particular device allowed?
// returns: true if the device is allowed, otherwise false
bool BestGpu::DeviceAllowed(int device)
bool BestGpu::DeviceAllowed(int deviceId)
{
assert((device >= -1) && (device <= 31));
assert((deviceId >= -1) && (deviceId <= 31));
if (device < 0)
if (deviceId < 0)
return !m_disallowCPUDevice;
else
return !!(m_allowedDevices & (1 << device));
return !!(m_allowedDevices & (1 << deviceId));
}
// DisallowDevice - remove a single device from the set that auto-selection may pick.
// deviceId -- GPU device id in [0, 31]; a negative id (-1) denotes the CPU device.
void BestGpu::DisallowDevice(int deviceId)
{
    // the allowed set is kept as a 32-bit mask, hence the upper bound of 31
    assert((deviceId >= -1) && (deviceId <= 31));

    if (deviceId < 0)
        m_disallowCPUDevice = true;
    else
        m_allowedDevices &= ~(1 << deviceId);
}
// AllowAll - Reset the allowed filter to allow all GPUs
@ -527,6 +535,68 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
return best; // return the array of the best GPUs
}
// Disallow devices which don't comply with the compute capability restriction when cntk runs with deviceId = 'auto'.
// Any enumerated GPU whose CUDA compute capability major version is below MininumCCMajorForGpu
// is removed from the allowed-device mask so that auto-selection never picks it.
void BestGpu::DisallowUnsupportedDevices()
{
    for (auto pd : m_procData)
    {
        if (pd->deviceProp.major < BestGpu::MininumCCMajorForGpu)
        {
            DisallowDevice(pd->deviceId);
        }
    }
}
// GetGpuData - look up the properties record for one device id.
// Returns a GpuData with validity GpuValidity::UnknownDevice when no GPU
// with the given id is present in the machine.
GpuData GetGpuData(DEVICEID_TYPE deviceId)
{
    for (const GpuData& candidate : GetAllGpusData())
    {
        if (candidate.deviceId == deviceId)
            return candidate;
    }

    // no matching device found -- synthesize an "unknown device" record
    return GpuData(0, 0, deviceId, 0, GpuValidity::UnknownDevice, "", 0);
}
// populate a vector with data (id, major/minor version, cuda cores, name and memory) for each gpu device in the machine
std::vector<GpuData> GetAllGpusData()
{
std::vector<GpuData> data;
auto bestGpu = make_unique<BestGpu>();
std::vector<ProcessorData*> processorData = bestGpu->GetProcessorData();
for (ProcessorData* pd : processorData)
{
GpuValidity validity = GpuValidity::UnknownDevice;
if (pd->deviceProp.major < BestGpu::MininumCCMajorForGpu)
{
validity = GpuValidity::ComputeCapabilityNotSupported;
}
else
{
validity = GpuValidity::Valid;
}
size_t totalMemory = pd->deviceProp.totalGlobalMem/(1024*1024); //From bytes to MBytes
GpuData gpuData = GpuData(pd->deviceProp.major, pd->deviceProp.minor, pd->deviceId, pd->cores, validity, string(pd->deviceProp.name), totalMemory);
data.push_back(gpuData);
}
return data;
}
// Expose the raw per-device ProcessorData records gathered by this BestGpu instance.
// Returns the vector by value, but the pointed-to ProcessorData objects are presumably
// still owned by this BestGpu -- callers should not use them past its lifetime (TODO confirm).
std::vector<ProcessorData*> BestGpu::GetProcessorData()
{
    return m_procData;
}
// QueryNvmlData - Query data from the Nvidia Management Library, and accumulate counters,
// In case failure, this function simply backs out without filling in the data structure and without setting m_nvmlData.
void BestGpu::QueryNvmlData()

Просмотреть файл

@ -8,15 +8,46 @@
// #define CPUONLY // #define this to build without GPU support nor needing the SDK installed
#include "CommonMatrix.h"
#include <vector>
// define IConfigRecord and ConfigParameters as incomplete types, in order to avoid having to include "ScriptableObjects.h" and "Config.h", as that confuses some .CU code
namespace Microsoft { namespace MSR { namespace ScriptableObjects { struct IConfigRecord; }}}
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
#ifndef CPUONLY
// Classifies whether a GPU device can be used by CNTK.
enum class GpuValidity
{
    Valid,                        // device meets all requirements
    UnknownDevice,                // no device with the queried id exists
    ComputeCapabilityNotSupported // CUDA compute capability below the required minimum
};
// Plain data record describing one GPU device (as reported by the CUDA runtime).
struct GpuData
{
    int versionMajor;     // CUDA compute capability, major version
    int versionMinor;     // CUDA compute capability, minor version
    int deviceId;         // CUDA device id
    int cudaCores;        // number of CUDA cores
    GpuValidity validity; // whether CNTK can use this device
    string name;          // device name as reported by the driver
    size_t totalMemory;   // total global memory, in MB

    GpuData(int versionMajor, int versionMinor, int deviceId, int cudaCores, GpuValidity validity, const string& name, size_t totalMemory)
        : versionMajor(versionMajor), versionMinor(versionMinor), deviceId(deviceId), cudaCores(cudaCores), validity(validity), name(name), totalMemory(totalMemory)
    {
    }
};
std::vector<GpuData> GetAllGpusData();
GpuData GetGpuData(DEVICEID_TYPE deviceId);
class ConfigParameters;
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config);
DEVICEID_TYPE DeviceFromConfig(const ScriptableObjects::IConfigRecord& config);
#else
template <class ConfigRecordType>
static inline DEVICEID_TYPE DeviceFromConfig(const ConfigRecordType& /*config*/)

Просмотреть файл

@ -282,7 +282,7 @@ class VariableSchema : public std::vector<VariableLayout>
Values<ElemType> CreateBuffers(const std::vector<size_t>& maxLengths)
{
if (maxLengths.size() != size())
throw std::exception("Expected max lengths for all variables.");
throw std::runtime_error("Expected max lengths for all variables.");
Values<ElemType> buffers(size());
for (size_t i = 0; i < size(); ++i)

Просмотреть файл

@ -29,7 +29,8 @@ public:
runtime_error(msg)
{
}
virtual void PrintError(const std::wstring& linePrefix) const = 0;
virtual std::wstring GetError(const std::wstring& /*linePrefix*/) const = 0;
virtual void PrintError(const std::wstring& /*linePrefix*/) const = 0;
};
// -----------------------------------------------------------------------

Просмотреть файл

@ -411,7 +411,7 @@ static inline void byteswap(V &v) throw()
// execute a block with retry
// Block must be restartable.
// Use this when writing small files to those unreliable Windows servers.
// Use this when writing/reading small files to those unreliable Windows servers.
// TODO: This will fail to compile under VS 2008--we need an #ifdef around this
template <typename FUNCTION>
static void attempt(int retries, const FUNCTION &body)

Просмотреть файл

@ -30,6 +30,7 @@
#include <assert.h>
#include <string.h> // for strerror()
#include <stdexcept> // for exception
#include <fcntl.h>
// ----------------------------------------------------------------------------
// fopenOrDie(): like fopen() but terminate with err msg in case of error.
@ -591,7 +592,8 @@ void fgetfile(const std::wstring& pathname, std::vector<char>& buffer);
void fgetfile(FILE* f, std::vector<char>& buffer);
namespace msra { namespace files {
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines);
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines, int numberOfTries = 1);
static inline std::vector<std::string> fgetfilelines(const std::wstring& pathname)
{
std::vector<char> buffer;
@ -599,7 +601,7 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
fgetfilelines(pathname, buffer, lines);
return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, int numberOfTries = 1);
}}
@ -698,8 +700,18 @@ class auto_file_ptr
{
if (f && f != stdin && f != stdout && f != stderr)
{
bool readMode = false;
#ifdef _WIN32
if ((f->_flag&_IOREAD) == _IOREAD)
readMode = true;
#else
int mode = fcntl(fileno(f), F_GETFL);
if ((mode & O_ACCMODE) == O_RDONLY)
readMode = true;
#endif
int rc = ::fclose(f);
if ((rc != 0) && !std::uncaught_exception())
if (!readMode && (rc != 0) && !std::uncaught_exception())
RuntimeError("auto_file_ptr: failed to close file: %s", strerror(errno));
f = NULL;

Просмотреть файл

@ -1251,7 +1251,7 @@ public:
// BUGBUG: we only really support one archive file at this point
// read the TOC in one swoop
std::vector<char> textbuffer;
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer);
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer, 3);
// parse it one by one
size_t archiveindex = SIZE_MAX; // its index

Просмотреть файл

@ -16,6 +16,7 @@
#endif
#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
#include "Basics.h"
#include "basetypes.h" // for attempt()
#include "fileutil.h"
#include "ProgressTracing.h"
@ -1632,6 +1633,11 @@ static size_t fgetfilechars(const std::wstring& path, vector<char>& buffer)
return len;
}
// Out-parameter variant of fgetfilechars() above; exists so the resulting length
// can be captured by reference from inside a retry lambda (see fgetfilelines()).
static void fgetfilechars(const std::wstring& path, vector<char>& buffer, size_t& len)
{
    len = fgetfilechars(path, buffer);
}
template <class LINES>
static void strtoklines(char* s, LINES& lines)
{
@ -1639,10 +1645,14 @@ static void strtoklines(char* s, LINES& lines)
lines.push_back(p);
}
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines)
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
lines.resize(0);
@ -1651,11 +1661,15 @@ void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer,
}
// same as above but returning const char* (avoiding the memory allocation)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
vector<char*> lines;
lines.reserve(len / 20);

Просмотреть файл

@ -72,6 +72,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(InvStdDevNode)) return New<InvStdDevNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(KhatriRaoProductNode)) return New<KhatriRaoProductNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogNode)) return New<LogNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogPlusNode)) return New<LogPlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogSoftmaxNode)) return New<LogSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LookupTableNode)) return New<LookupTableNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MatrixL1RegNode)) return New<MatrixL1RegNode<ElemType>>(forward<_Types>(_Args)...);
@ -657,6 +658,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Plus(
return net.AddNodeToNetAndAttachInputs(New<PlusNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
// Creates a LogPlusNode -- ln(exp(a) + exp(b)), computed in an overflow-safe way --
// adds it to the network, and attaches a and b as its inputs.
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LogPlus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
    return net.AddNodeToNetAndAttachInputs(New<LogPlusNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Less(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{

Просмотреть файл

@ -134,6 +134,7 @@ public:
ComputationNodePtr InvStdDev(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr KhatriRaoProduct(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Log(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr LogPlus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Logistic(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L"");
ComputationNodePtr Logistic(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");

Просмотреть файл

@ -76,7 +76,7 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
for (const auto& fromNode : GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
{
wstring fromNodeName = fromNode->NodeName();
wstring toNodeName = toNamePrefix + fromNodeName;

Просмотреть файл

@ -67,6 +67,8 @@ template class PlusNode<double>;
// -----------------------------------------------------------------------
// LogPlusNode (summand1, summand2)
// Computes ln(exp(summand1) + exp(summand2)) in an overflow safe way.
// Useful e.g. for computing softmax over sequence.
// -----------------------------------------------------------------------
template <class ElemType>
@ -105,8 +107,16 @@ public:
if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex)))
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
// TODO: would be nice to state the derivative here in a comment
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
if (inputIndex == 0)
{
// d/dx (ln( exp(x) + (exp(y)) = exp(x) / (exp(x) + exp(y)) = 1 / (1 + exp(y-x)) = sigmoid(x-y)
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input1, input0);
}
else
{
// d/dy (ln( exp(x) + (exp(y)) = exp(y) / (exp(x) + exp(y)) = 1 / (1 + exp(x-y)) = sigmoid(y-x)
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
}
}
};

Просмотреть файл

@ -321,15 +321,17 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
RuntimeError("Expected %d outputs, but got %d.", (int)m_outputNodes.size(), (int)outputs.size());
size_t i = 0;
for (auto& input : m_inputMatrices)
for (auto& inputNode : m_inputNodes)
{
// const cast: The matrix class takes this over without copying and could theoretically change the contents,
// though it doesn't in this case.
auto& buffer = const_cast<ValueBuffer<ElemType, ValueContainer>&>(inputs[i]);
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(inputNode->ValuePtr());
auto type = matrix->GetMatrixType();
size_t numRows = input.second.sampleLayout.GetNumElements();
size_t numRows = inputNode->GetSampleLayout().GetNumElements();
if (buffer.m_buffer.data() == nullptr)
RuntimeError("Input %ls: Buffer is not allocated.", m_inputNodes[i]->GetName().c_str());
if (type == MatrixType::DENSE)
{
if (buffer.m_buffer.size() % numRows != 0)
@ -340,8 +342,12 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
}
else if (type == MatrixType::SPARSE)
{
if (buffer.m_colIndices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected colIndices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_indices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected Indices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices.size() < 2)
RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
RuntimeError("Input %ls: Expected at least one element (2 entries in colIndices array).", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[0] != 0)
RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
@ -352,8 +358,8 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
int numCols = type == MatrixType::DENSE ? buffer.m_buffer.size() / numRows : buffer.m_colIndices.size() - 1;
assert(numCols >= 1);
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
inputNode->GetMBLayout()->Init(1, numCols);
inputNode->GetMBLayout()->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);

Просмотреть файл

@ -14,6 +14,11 @@
#include <msclr\marshal_cppstd.h>
#include "CNTKException.h"
#pragma warning(push)
#pragma warning(disable : 4793) // Function compiled as native
#include "Basics.h"
#include "ScriptableObjects.h"
#pragma warning(pop)
#include "EvalCommon.h"
#include "Eval.h"
@ -172,23 +177,10 @@ public:
/// <param name="funcName">Factory function name for retrieving the native model from the dll.</param>
ModelEvaluationExtended(String^ funcName)
{
auto dir = System::IO::Path::GetDirectoryName(System::Reflection::Assembly::GetExecutingAssembly()->Location);
auto dllFileName = System::IO::Path::Combine(dir, "evaldll.dll");
pin_ptr<const WCHAR> dllname = PtrToStringChars(dllFileName);
auto hModule = LoadLibrary(dllname);
if (hModule == nullptr)
{
throw gcnew CNTKException(System::String::Format("Cannot find library: {0}", gcnew String(dllname)));
}
try
{
msclr::interop::marshal_context context;
const std::string func = context.marshal_as<std::string>(funcName);
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<ElemType>)procAddress;
pin_ptr <IEvaluateModelExtended<ElemType>*> p_eval = &m_eval;
getEvalProc(p_eval);
GetEvalExtended<ElemType>(p_eval);
}
catch (const exception& ex)
{
@ -263,7 +255,14 @@ public:
outputNodeNames.push_back(context.marshal_as<std::wstring>(output));
}
m_eval->StartForwardEvaluation(outputNodeNames);
try
{
m_eval->StartForwardEvaluation(outputNodeNames);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
@ -367,6 +366,11 @@ private:
{
return gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
}
else if (dynamic_cast<const ScriptableObjects::ScriptingException*>(&ex) != nullptr) // Includes derived classes
{
const auto& err = dynamic_cast<const ScriptableObjects::ScriptingException&>(ex);
return gcnew CNTKLogicErrorException(gcnew System::String(wstrprintf(L"%ls\n%ls", utf16(err.what()).c_str(), err.GetError(L"").c_str()).c_str()), nullptr);
}
else
{
return gcnew CNTKException(gcnew System::String(ex.what()));

Просмотреть файл

@ -43,21 +43,10 @@ public:
/// <param name="funcName">Factory function name for retrieving the native model from the dll.</param>
IEvaluateModelManaged(String^ funcName)
{
pin_ptr<const WCHAR> dllname = PtrToStringChars("evaldll.dll");
auto hModule = LoadLibrary(dllname);
if (hModule == nullptr)
{
throw gcnew CNTKException(System::String::Format("Cannot find library: {0}", gcnew String(dllname)));
}
try
{
msclr::interop::marshal_context context;
const std::string func = context.marshal_as<std::string>(funcName);
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<ElemType>)procAddress;
pin_ptr <IEvaluateModel<ElemType>*> p_eval = &m_eval;
getEvalProc(p_eval);
GetEval<ElemType>(p_eval);
}
catch (const exception& ex)
{

Просмотреть файл

@ -56,6 +56,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
@ -66,8 +68,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -77,8 +77,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemGroup>

Просмотреть файл

@ -9,6 +9,7 @@
#include <emmintrin.h>
#include <tmmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <assert.h>
#include <cstdint>
#include <iostream>

Просмотреть файл

@ -116,6 +116,44 @@ const char* CudaErrString<curandStatus>(curandStatus)
namespace Microsoft { namespace MSR { namespace CNTK {
// Global switch: when false (the default), SyncGuard instances do nothing
// unless constructed with forceSync = true.
/*static*/ bool SyncGuard::s_isSyncEnabled = false;

// Turn on synchronization for all subsequently constructed SyncGuard instances.
/*static*/ void SyncGuard::EnableSync()
{
    s_isSyncEnabled = true;
}
// Begin a synchronized stretch of CUDA work. When syncing is requested
// (per-instance via forceSync, or globally via EnableSync()), clear any
// pending CUDA error and create the event that the destructor will record
// and wait on; otherwise leave m_done null so the destructor is a no-op.
SyncGuard::SyncGuard(bool forceSync /*= false*/)
    : m_forceSync(forceSync)
{
    m_done = nullptr;
    if (m_forceSync || s_isSyncEnabled)
    {
        // failures of earlier, unrelated launches would otherwise surface here
        CUDA_CALL(cudaGetLastError());
        CUDA_CALL(cudaEventCreate(&m_done));
    }
}
// End of the synchronized stretch: record an event and wait for it, so kernel
// errors from this stretch surface here as exceptions (via CUDA_CALL).
SyncGuard::~SyncGuard()
{
    // Guard on m_done rather than re-evaluating (m_forceSync || s_isSyncEnabled):
    // if EnableSync() is called after this guard was constructed with syncing
    // disabled, m_done is still nullptr and recording it would fail. m_done is
    // non-null exactly when the constructor decided to sync.
    if (m_done != nullptr)
    {
        // The regular use of this destructor is to synchronize the GPU, but also
        // to check for errors. So this destructor is where CUDA errors would be thrown.
        // If this destructor runs during stack unwinding, then a different error has
        // already happened that should be reported; so we only clean up the resource.
        if (std::uncaught_exception())
            cudaEventDestroy(m_done);
        else
        {
            // failures in a prior launch might be reported here
            CUDA_CALL(cudaEventRecord(m_done));
            CUDA_CALL(cudaEventSynchronize(m_done));
            CUDA_CALL(cudaEventDestroy(m_done));
        }
    }
}
template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
{
@ -4278,11 +4316,16 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
template <class ElemType>
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
// using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues.
// and using shared_ptrs since we don't want to leak more than CacheSize elements
// when using a plain array we would have to control lifetime of the object and destructor would be called for every element in the array at the end
const int CacheSize = 32;
static shared_ptr<GPUMatrix<ElemType>> * onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects
if (deviceId >= CacheSize){
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
}
auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{

Просмотреть файл

@ -61,6 +61,27 @@ cudaStream_t MATH_API GetStream();
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// SyncGuard -- synchronize around CUDA calls
//
// While an instance is alive it brackets a stretch of CUDA work; on
// destruction it records a CUDA event and waits for it, so errors from that
// stretch surface at a deterministic point. Synchronization happens only
// when requested per-instance (forceSync) or globally via EnableSync().
// -----------------------------------------------------------------------

class SyncGuard
{
private:
    static bool s_isSyncEnabled; // global switch, set once via EnableSync()
    bool m_forceSync;            // per-instance override from the constructor
#ifndef CPUONLY
    cudaEvent_t m_done;          // event recorded/awaited in the destructor (GPU builds only)
#endif

public:
    static MATH_API void EnableSync();
    SyncGuard(bool forceSync = false);
    ~SyncGuard();
};
// -----------------------------------------------------------------------
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU
// -----------------------------------------------------------------------
@ -623,51 +644,4 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
// -----------------------------------------------------------------------
// SyncGuard -- synchronize around CUDA calls
// -----------------------------------------------------------------------
class SyncGuard
{
static bool DoSync()
{
#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
static bool do_sync = false;
#else
static bool do_sync = true;
#endif
return do_sync;
}
cudaEvent_t m_done;
public:
SyncGuard()
{
m_done = nullptr;
if (DoSync())
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}
~SyncGuard()
{
if (DoSync())
{
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
};
#endif // CPUONLY

Просмотреть файл

@ -2276,6 +2276,9 @@ float CudaTimer::Elapsed()
return 0;
}
// CPU-only build: there is no GPU to synchronize, so enabling sync is a no-op.
/*static*/ void SyncGuard::EnableSync()
{
}
} } }
// define a dummy GPUWatcher class too

Просмотреть файл

@ -14,6 +14,10 @@
#pragma warning(push)
#pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
template <class ElemType> struct TensorTest;
}}}}
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
@ -148,7 +152,8 @@ private:
// -------------------------------------------------------------------
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
friend Test::TensorTest<ElemType>;
// -------------------------------------------------------------------
// sob members

Просмотреть файл

@ -110,9 +110,6 @@
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="uci_to_cntk_text_format_converter.py" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

Просмотреть файл

@ -47,13 +47,5 @@
<Filter Include="Common\Include">
<UniqueIdentifier>{C6F55578-121A-4D7C-8F57-4172BC5C463B}</UniqueIdentifier>
</Filter>
<Filter Include="Scripts">
<UniqueIdentifier>{cd70d891-88aa-40a4-8e47-0e31e4cac48e}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="uci_to_cntk_text_format_converter.py">
<Filter>Scripts</Filter>
</None>
</ItemGroup>
</Project>
</Project>

Просмотреть файл

@ -78,6 +78,12 @@ public:
return &m_utterances[index];
}
// Get utterance description by its index (non-const overload, so the caller
// may mutate the returned UtteranceDescription, e.g. set its expansion length).
UtteranceDescription* GetUtterance(size_t index)
{
    return &m_utterances[index];
}
// Get start frame index inside chunk.
size_t GetStartFrameIndexInsideChunk(size_t index) const
{
@ -105,7 +111,7 @@ public:
}
const size_t ts = m_firstFrames[index];
const size_t n = GetUtterance(index)->GetNumberOfFrames();
const size_t n = m_utterances[index].GetNumberOfFrames();
return msra::dbn::matrixstripe(m_frames, ts, n);
}

Просмотреть файл

@ -9,7 +9,6 @@
#include "HTKDataDeserializer.h"
#include "ConfigHelper.h"
#include "Basics.h"
#include <numeric>
// TODO: This will be removed when dependency on old code is eliminated.
// Currently this fixes the linking.
@ -46,6 +45,12 @@ HTKDataDeserializer::HTKDataDeserializer(
ConfigParameters input = inputs.front();
auto inputName = input.GetMemberIds().front();
m_expandToPrimary = cfg(L"expandToUtterance", false);
if (m_expandToPrimary && m_primary)
{
InvalidArgument("Cannot expand utterances of the primary stream %ls, please change your configuration.", inputName.c_str());
}
ConfigParameters streamConfig = input(inputName);
ConfigHelper config(streamConfig);
@ -85,6 +90,12 @@ HTKDataDeserializer::HTKDataDeserializer(
m_dimension = config.GetFeatureDimension();
m_dimension = m_dimension * (1 + context.first + context.second);
m_expandToPrimary = feature(L"expandToUtterance", false);
if (m_expandToPrimary && m_primary)
{
InvalidArgument("Cannot expand utterances of the primary stream %ls, please change your configuration.", featureName.c_str());
}
InitializeChunkDescriptions(config);
InitializeStreams(featureName);
InitializeFeatureInformation();
@ -118,6 +129,13 @@ void HTKDataDeserializer::InitializeChunkDescriptions(ConfigHelper& config)
UtteranceDescription description(move(msra::asr::htkfeatreader::parsedpath(u)));
size_t numberOfFrames = description.GetNumberOfFrames();
if (m_expandToPrimary && numberOfFrames != 1)
{
RuntimeError("Expanded stream should only contain sequences of length 1, utterance '%s' has %d",
description.GetKey().c_str(),
(int)numberOfFrames);
}
// For logging, also account for utterances and frames that we skip
allUtterances++;
allFrames += numberOfFrames;
@ -470,7 +488,8 @@ void HTKDataDeserializer::GetSequenceById(ChunkIdType chunkId, size_t id, vector
// wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors()
MatrixAsVectorOfVectors utteranceFramesWrapper(utteranceFrames);
FeatureMatrix features(m_dimension, m_frameMode ? 1 : utterance->GetNumberOfFrames());
size_t utteranceLength = m_frameMode ? 1 : (m_expandToPrimary ? utterance->GetExpansionLength() : utterance->GetNumberOfFrames());
FeatureMatrix features(m_dimension, utteranceLength);
if (m_frameMode)
{
@ -479,9 +498,16 @@ void HTKDataDeserializer::GetSequenceById(ChunkIdType chunkId, size_t id, vector
auto fillIn = features.col(0);
AugmentNeighbors(utteranceFramesWrapper, frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, fillIn);
}
else
else if (m_expandToPrimary) // Broadcast a single frame to the complete utterance.
{
for (size_t resultingIndex = 0; resultingIndex < utterance->GetExpansionLength(); ++resultingIndex)
{
auto fillIn = features.col(resultingIndex);
AugmentNeighbors(utteranceFramesWrapper, 0, m_augmentationWindow.first, m_augmentationWindow.second, fillIn);
}
}
else // Augment the complete utterance.
{
// Augment complete utterance.
for (size_t frameIndex = 0; frameIndex < utterance->GetNumberOfFrames(); ++frameIndex)
{
auto fillIn = features.col(frameIndex);
@ -508,10 +534,10 @@ void HTKDataDeserializer::GetSequenceById(ChunkIdType chunkId, size_t id, vector
}
// Gets sequence description by its key.
bool HTKDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& d)
bool HTKDataDeserializer::GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& d)
{
assert(!m_primary);
auto iter = m_keyToChunkLocation.find(key.m_sequence);
auto iter = m_keyToChunkLocation.find(primary.m_key.m_sequence);
if (iter == m_keyToChunkLocation.end())
{
return false;
@ -519,11 +545,29 @@ bool HTKDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, Sequen
auto chunkId = iter->second.first;
auto utteranceIndexInsideChunk = iter->second.second;
const auto& chunk = m_chunks[chunkId];
const auto& sequence = chunk.GetUtterance(utteranceIndexInsideChunk);
auto& chunk = m_chunks[chunkId];
auto utterance = chunk.GetUtterance(utteranceIndexInsideChunk);
d.m_chunkId = (ChunkIdType)chunkId;
d.m_id = m_frameMode ? chunk.GetStartFrameIndexInsideChunk(utteranceIndexInsideChunk) + key.m_sample : utteranceIndexInsideChunk;
d.m_numberOfSamples = m_frameMode ? 1 : (uint32_t)sequence->GetNumberOfFrames();
// TODO: When we move frame mode from deserializer, expanding should go away and be done on the higher level.
// TODO: Currently for the frame mode the secondary deserializer does not know the size of the full utterance,
// because each frame has its own sequence description. So we get the length by the max sample we have seen.
if (m_expandToPrimary)
{
// Expanding for sequence length/or max seen frame.
size_t maxLength = max(primary.m_numberOfSamples, (uint32_t)primary.m_key.m_sample + 1);
if (utterance->GetExpansionLength() < maxLength)
{
utterance->SetExpansionLength(maxLength);
}
d.m_id = utteranceIndexInsideChunk;
}
else
{
d.m_id = m_frameMode ? chunk.GetStartFrameIndexInsideChunk(utteranceIndexInsideChunk) + primary.m_key.m_sample : utteranceIndexInsideChunk;
}
d.m_numberOfSamples = m_frameMode ? 1 : (uint32_t)utterance->GetNumberOfFrames();
return true;
}

Просмотреть файл

@ -34,8 +34,8 @@ public:
// Retrieves data for a chunk.
virtual ChunkPtr GetChunk(ChunkIdType chunkId) override;
// Gets sequence description by its key.
virtual bool GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override;
// Gets sequence description by the primary one.
virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription&) override;
private:
class HTKChunk;
@ -84,6 +84,10 @@ private:
unsigned int m_samplePeriod = 0;
size_t m_ioFeatureDimension = 0;
std::string m_featureKind;
// A flag that indicates whether the utterance should be extended to match the length of the utterance from the primary deserializer.
// TODO: This should be moved to the packers when deserializers work in sequence mode only.
bool m_expandToPrimary;
};
typedef std::shared_ptr<HTKDataDeserializer> HTKDataDeserializerPtr;

Просмотреть файл

@ -20,9 +20,12 @@ class UtteranceDescription
// Utterance id.
size_t m_id;
// Expansion length in case if utterance should be expanded.
size_t m_expansionLength;
public:
UtteranceDescription(msra::asr::htkfeatreader::parsedpath&& path)
: m_path(std::move(path))
: m_path(std::move(path)), m_expansionLength(0)
{
}
@ -48,6 +51,9 @@ public:
size_t GetId() const { return m_id; }
void SetId(size_t id) { m_id = id; }
size_t GetExpansionLength() const { return m_expansionLength; }
void SetExpansionLength(size_t length) { m_expansionLength = length; }
};
}}}

Просмотреть файл

@ -107,7 +107,6 @@
<ClInclude Include="..\..\Common\Include\DataReader.h" />
<ClInclude Include="..\..\Common\Include\DataWriter.h" />
<ClInclude Include="..\..\Common\Include\ssematrix.h" />
<ClInclude Include="basetypes.h" />
<ClInclude Include="biggrowablevectors.h" />
<ClInclude Include="chunkevalsource.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />

Просмотреть файл

@ -32,9 +32,6 @@
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="utterancesourcemulti.h" />
<ClInclude Include="basetypes.h">
<Filter>Duplicates to remove</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\DataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>

Просмотреть файл

@ -71,6 +71,11 @@ protected:
const unsigned char* b = (const unsigned char*) &v;
return (short) ((b[0] << 8) + b[1]);
}
// Byte-swaps a 16-bit unsigned value by reassembling it from its two in-memory
// bytes: the first byte in memory becomes the high byte of the result.
static unsigned short swapunsignedshort(unsigned short v) throw()
{
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(&v);
    const unsigned int high = bytes[0];
    const unsigned int low = bytes[1];
    return static_cast<unsigned short>((high << 8) + low);
}
static int swapint(int v) throw()
{
const unsigned char* b = (const unsigned char*) &v;
@ -81,13 +86,13 @@ protected:
{
int nsamples;
int sampperiod;
short sampsize;
unsigned short sampsize;
short sampkind;
void read(FILE* f)
{
nsamples = fgetint(f);
sampperiod = fgetint(f);
sampsize = fgetshort(f);
sampsize =(unsigned short) fgetshort(f);
sampkind = fgetshort(f);
}
@ -102,21 +107,24 @@ protected:
sampkind = (short) 9; // user type
int nRows = swapint(fgetint(f));
int nCols = swapint(fgetint(f));
sampsize = (short) (nRows * nCols); // features are stored as bytes;
int rawsampsize = nRows * nCols;
if (rawsampsize > UINT16_MAX)
RuntimeError("reading idx feature cache header: sample size overflow");
sampsize = (unsigned short)rawsampsize; // features are stored as bytes;
}
void write(FILE* f)
{
fputint(f, nsamples);
fputint(f, sampperiod);
fputshort(f, sampsize);
fputshort(f, (short) sampsize);
fputshort(f, sampkind);
}
void byteswap()
{
nsamples = swapint(nsamples);
sampperiod = swapint(sampperiod);
sampsize = swapshort(sampsize);
sampsize = swapunsignedshort(sampsize);
sampkind = swapshort(sampkind);
}
};
@ -215,7 +223,10 @@ public:
H.nsamples = 0; // unknown for now, updated in close()
H.sampperiod = period;
const int bytesPerValue = sizeof(float); // we do not support compression for now
H.sampsize = (short) featdim * bytesPerValue;
size_t rawsampsize = featdim * bytesPerValue;
if (rawsampsize > UINT16_MAX)
RuntimeError("htkfeatwriter: sample size overflow");
H.sampsize = (unsigned short)rawsampsize;
H.sampkind = parsekind(kind);
if (needbyteswapping)
H.byteswap();

Просмотреть файл

@ -60,6 +60,12 @@ void Bundler::CreateChunkDescriptions()
RuntimeError("Driving deserializer provided too many chunks.");
}
// Creating a table of weak chunks for non driving deserializers.
for (size_t i = 0; i < m_deserializers.size(); ++i)
{
m_weakChunkTable.push_back(std::vector<std::weak_ptr<Chunk>>(m_deserializers[i]->GetChunkDescriptions().size()));
}
m_chunks.reserve(chunks.size());
if (m_verbosity)
@ -105,7 +111,7 @@ void Bundler::CreateChunkDescriptions()
size_t sequenceSamples = sequence.m_numberOfSamples;
for (size_t deserializerIndex = 1; deserializerIndex < m_deserializers.size(); ++deserializerIndex)
{
isValid = m_deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequenceDescriptions[sequenceIndex].m_key, s);
isValid = m_deserializers[deserializerIndex]->GetSequenceDescription(sequenceDescriptions[sequenceIndex], s);
if (!isValid)
{
invalid.insert(sequenceIndex);
@ -193,7 +199,7 @@ void Bundler::GetSequencesForChunk(ChunkIdType chunkId, std::vector<SequenceDesc
uint32_t sequenceSamples = sequence.m_numberOfSamples;
for (size_t deserializerIndex = 1; deserializerIndex < m_deserializers.size(); ++deserializerIndex)
{
m_deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequence.m_key, s);
m_deserializers[deserializerIndex]->GetSequenceDescription(sequence, s);
sequenceSamples = std::max(sequenceSamples, s.m_numberOfSamples);
}
sequence.m_numberOfSamples = sequenceSamples;
@ -251,10 +257,9 @@ public:
// Creating sequence mapping and requiring underlying chunks.
SequenceDescription s;
for (size_t deserializerIndex = 1; deserializerIndex < m_parent->m_deserializers.size(); ++deserializerIndex)
for (size_t deserializerIndex = 1; deserializerIndex < deserializers.size(); ++deserializerIndex)
{
std::map<size_t, ChunkPtr> secondaryChunks;
auto& chunkTable = m_parent->m_weakChunkTable[deserializerIndex];
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
@ -263,19 +268,14 @@ public:
}
size_t currentIndex = sequenceIndex * deserializers.size() + deserializerIndex;
deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequences[sequenceIndex].m_key, s);
deserializers[deserializerIndex]->GetSequenceDescription(sequences[sequenceIndex], s);
m_sequenceToSequence[currentIndex] = s.m_id;
ChunkPtr secondaryChunk;
auto it = secondaryChunks.find(s.m_chunkId);
if (it == secondaryChunks.end())
ChunkPtr secondaryChunk = chunkTable[s.m_chunkId].lock();
if (!secondaryChunk)
{
secondaryChunk = deserializers[deserializerIndex]->GetChunk(s.m_chunkId);
secondaryChunks.insert(make_pair(s.m_chunkId, secondaryChunk));
}
else
{
secondaryChunk = it->second;
chunkTable[s.m_chunkId] = secondaryChunk;
}
m_innerChunks[currentIndex] = secondaryChunk;

Просмотреть файл

@ -59,6 +59,10 @@ private:
// (i.e. often in speech)
bool m_takePrimarySequenceLength;
// A table of loaded chunks to make sure we do not load same chunk twice.
// Inner vector is the table of chunk id into weak pointer, the outer vector has an element per deserializer.
std::vector<std::vector<std::weak_ptr<Chunk>>> m_weakChunkTable;
// General configuration
int m_verbosity;
};

Просмотреть файл

@ -37,9 +37,9 @@ public:
return m_deserializer->GetSequencesForChunk(chunkId, descriptions);
}
virtual bool GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& description) override
virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& description) override
{
return m_deserializer->GetSequenceDescriptionByKey(key, description);
return m_deserializer->GetSequenceDescription(primary, description);
}
// Gets chunk data given its id.

Просмотреть файл

@ -161,11 +161,11 @@ public:
// Gets sequence descriptions for a given a chunk.
virtual void GetSequencesForChunk(ChunkIdType chunkId, std::vector<SequenceDescription>& descriptions) = 0;
// Gets sequence description by its key.
// Used by deserializers not in driving/primary mode.
// Returns false if provided sequence is not valid.
// Gets sequence description given the sequence description of the primary deserializer.
// Used for deserializers not in driving/primary mode.
// Returns false if the corresponding secondary sequence is not valid.
// TODO: Possibly move this out into a separate interface.
virtual bool GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& description) = 0;
virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& description) = 0;
// Gets chunk data given its id.
virtual ChunkPtr GetChunk(ChunkIdType chunkId) = 0;

Просмотреть файл

@ -17,9 +17,9 @@ public:
DataDeserializerBase()
{}
virtual bool GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override
virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& result) override
{
NOT_IMPLEMENTED;
return GetSequenceDescriptionByKey(primary.m_key, result);
}
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override
@ -28,6 +28,11 @@ public:
}
protected:
virtual bool GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&)
{
NOT_IMPLEMENTED;
}
// Streams this data deserializer can produce.
std::vector<StreamDescriptionPtr> m_streams;

Просмотреть файл

@ -900,6 +900,14 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
bool noMoreSamplesToProcess = false;
for (;;)
{
// Per-minibatch performance measurements; only enabled when perfTraceLevel > 0
Timer fineGrainedPerfMeasurementTimer;
double readTime = 0;
double computeTime = 0;
double parameterUpdateTime = 0;
if (m_perfTraceLevel > 0)
fineGrainedPerfMeasurementTimer.Start();
// get minibatch
// TODO: is it guaranteed that the GPU is already completed at this point, is it safe to overwrite the buffers?
size_t actualMBSize = 0;
@ -908,6 +916,13 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
if (!wasDataRead && (!useDistributedMBReading || noMoreSamplesToProcess)) // in case of distributed reading, we do a few more loops until all ranks have completed
break; // end of epoch
if (m_perfTraceLevel > 0)
{
fineGrainedPerfMeasurementTimer.Stop();
readTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
fineGrainedPerfMeasurementTimer.Start();
}
// Note: If !wasDataRead then the data that GetMinibatchIntoNetwork() was supposed to full in are undefined.
// Must not touch them.
@ -998,6 +1013,15 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
smbDispatcher.DoneWithCurrentMinibatch();
} // if (actualMBSize > 0)
if (m_perfTraceLevel > 0)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(net->GetDeviceId()));
mainStreamSyncEvent->SynchronizeEvent();
fineGrainedPerfMeasurementTimer.Stop();
computeTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
fineGrainedPerfMeasurementTimer.Start();
}
// for momentum/clipping/regularization/etc., as well as for progress and statistics, we should only count frames that are not gaps
// #samples according to the default dynamic axis, for use with criterion nodes that do not have an MBLayout
size_t numSamplesWithLabelOfNetwork = wasDataRead ? net->GetNumSamplesWithLabelOfNetwork(actualMBSize) : 0;
@ -1105,6 +1129,17 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
}
if (m_perfTraceLevel > 0)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(net->GetDeviceId()));
mainStreamSyncEvent->SynchronizeEvent();
fineGrainedPerfMeasurementTimer.Stop();
parameterUpdateTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
PREPENDTS(stderr);
fprintf(stderr, "Perf trace: Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs\n", readTime, computeTime, parameterUpdateTime);
}
commTimer.Start();
// aggregation by model averaging or block momentum
if (useModelAggregation)
@ -1131,7 +1166,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
if (useDistributedMBReading)
{
noMoreSamplesToProcess = !wasDataRead;
}
}
if (nSamplesSinceLastModelSync >= m_nFramesBetweenASGDSync[epochNumber])
{
@ -2629,6 +2664,8 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
// BUGBUG: these are not passed to Init()
m_doUnitTest = configSGD(L"unitTest", false);
m_perfTraceLevel = configSGD(L"perfTraceLevel", (int)0);
// parallel training
m_parallelizationMethod = ParallelizationMethod::none;
m_numGradientBits = 32;
@ -2650,27 +2687,27 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
else
{
size_t numMPIWorkers = pMPI->NumNodesInUse();
const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
m_parallelizationStartEpochNum = configParallelTrain(L"parallelizationStartEpoch", (int)1) - 1; // Epoch numbers internally are 0 based
m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
m_syncStatsTrace = configParallelTrain(L"syncPerfStats", (int)0);
if (configParallelTrain.Exists(L"DataParallelSGD"))
{
const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
size_t defaultGradientBits = 8 * sizeofElemType;
m_numGradientBits = configDataParallelSGD(L"gradientBits", defaultGradientBits);
m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
if (configParallelTrain.Exists(L"DataParallelSGD"))
{
InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
size_t defaultGradientBits = 8 * sizeofElemType;
m_numGradientBits = configDataParallelSGD(L"gradientBits", defaultGradientBits);
m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
{
InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
}
}
}
if (configParallelTrain.Exists(L"ModelAveragingSGD"))
{
const ConfigRecordType& configMASGD(configParallelTrain(L"ModelAveragingSGD", ConfigRecordType::Record()));
if (configParallelTrain.Exists(L"ModelAveragingSGD"))
{
const ConfigRecordType& configMASGD(configParallelTrain(L"ModelAveragingSGD", ConfigRecordType::Record()));
if (configMASGD.Exists(L"blockSizePerWorker") && configMASGD.Exists(L"blockSize"))
{
InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
@ -2689,8 +2726,8 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_modelAggregationBlockSize = 40000 * numMPIWorkers; // default value
}
#if 1 // legacy option
if (configMASGD.Exists(L"syncFrequencyInFrames"))
{
if (configMASGD.Exists(L"syncFrequencyInFrames"))
{
if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
InvalidArgument("syncFrequencyInFrames is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
m_modelAggregationBlockSize = configMASGD(L"syncFrequencyInFrames");
@ -2706,15 +2743,15 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_modelAggregationBlockSize = configMASGD(L"syncPeriod");
m_modelAggregationBlockSize *= numMPIWorkers;
fprintf(stderr, "WARNING: option syncPeroid in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
}
}
#endif
}
if (configParallelTrain.Exists(L"BlockMomentumSGD"))
{
}
if (configParallelTrain.Exists(L"BlockMomentumSGD"))
{
#ifndef CNTK_PARALLEL_TRAINING_SUPPORT
InvalidArgument("BlockMomentumSGD is not enabled in this version.\n");
InvalidArgument("BlockMomentumSGD is not enabled in this version.\n");
#else
const ConfigRecordType& configBMSGD(configParallelTrain(L"BlockMomentumSGD", ConfigRecordType::Record()));
const ConfigRecordType& configBMSGD(configParallelTrain(L"BlockMomentumSGD", ConfigRecordType::Record()));
if (configBMSGD.Exists(L"blockSize") && configBMSGD.Exists(L"blockSizePerWorker"))
{
InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
@ -2744,33 +2781,33 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
fprintf(stderr, "WARNING: option syncPeroid in BlockMomentumSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
}
#endif
m_resetSGDMomentum = configBMSGD(L"resetSGDMomentum", true);
m_useNesterovBlockMomentum = configBMSGD(L"useNesterovMomentum", true);
m_blockLearningRate = configBMSGD(L"blockLearningRate", 1.0);
m_resetSGDMomentum = configBMSGD(L"resetSGDMomentum", true);
m_useNesterovBlockMomentum = configBMSGD(L"useNesterovMomentum", true);
m_blockLearningRate = configBMSGD(L"blockLearningRate", 1.0);
if (configBMSGD.Exists(L"blockMomentumPerSync") && configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
{
InvalidArgument("It is only allowed to set either blockMomentumPerSync or blockMomentumAsTimeConstant, not both of them");
}
else if (configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
{
m_blockMomentumAsTimeConstant = configBMSGD(L"blockMomentumAsTimeConstant");
}
if (configBMSGD.Exists(L"blockMomentumPerSync") && configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
{
InvalidArgument("It is only allowed to set either blockMomentumPerSync or blockMomentumAsTimeConstant, not both of them");
}
else if (configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
{
m_blockMomentumAsTimeConstant = configBMSGD(L"blockMomentumAsTimeConstant");
}
#if 1 // This option "blockMomentumPerSync" is going to be deprecated in the future
else if (configBMSGD.Exists(L"blockMomentumPerSync"))
{
double blockMomentum = configBMSGD(L"blockMomentumPerSync");
else if (configBMSGD.Exists(L"blockMomentumPerSync"))
{
double blockMomentum = configBMSGD(L"blockMomentumPerSync");
m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
}
}
#endif
else /*if (!configBMSGD.Exists(L"blockMomentumPerSync") && !configBMSGD.Exists(L"blockMomentumAsTimeConstant"))*/
{
else /*if (!configBMSGD.Exists(L"blockMomentumPerSync") && !configBMSGD.Exists(L"blockMomentumAsTimeConstant"))*/
{
double blockMomentum = 1.0 - 1.0 / (double)numMPIWorkers; // this is a default value which ensures each block update contributes equally
m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
}
}
#endif
}
if (configParallelTrain.Exists(L"DataParallelASGD"))
{
const ConfigRecordType & configDataParallelASGD(configParallelTrain(L"DataParallelASGD", ConfigRecordType::Record()));
@ -2784,7 +2821,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_adjustcoefficient = configAdjustLearningRateAtBeginning(L"adjustCoefficient", (double)0.1);
m_adjustnbminibatch = configAdjustLearningRateAtBeginning(L"adjustNbMinibatch", (size_t)256);
}
}
}
} // if (!pMPI)
} // if (configSGD.Exists(L"ParallelTrain"))
}

Просмотреть файл

@ -253,6 +253,8 @@ protected:
bool m_useAllDataForPreComputedNode;
int m_perfTraceLevel;
// Parallel training
MPIWrapperPtr m_mpi;

Просмотреть файл

@ -0,0 +1,189 @@
#!/usr/bin/env python
# ----------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
# ---------------------------------------------------------
# This script extracts information (hardware used, final results) contained in the baselines files
# and generates a markdown file (wiki page)
import sys, os, re
import TestDriver as td
try:
import six
except ImportError:
print("Python package 'six' not installed. Please run 'pip install six'.")
sys.exit(1)
thisDir = os.path.dirname(os.path.realpath(__file__))
windows = os.getenv("OS")=="Windows_NT"
class Baseline:
    """Hardware description and final train/test results parsed from one
    baseline log file."""

    def __init__(self, fullPath, testResult = "", trainResult = ""):
        # Path of the baseline file this record was parsed from.
        self.fullPath = fullPath
        self.cpuInfo = ""
        self.gpuInfo = ""
        self.testResult = testResult
        self.trainResult = trainResult

    # Extracts the last train/test result lines. e.g.
    # Finished Epoch[ 5 of 5]: [Training] ce = 2.32253198 * 1000 err = 0.90000000 * 1000 totalSamplesSeen = 5000 learningRatePerSample = 2e-06 epochTime=0.175781
    # Final Results: Minibatch[1-1]: err = 0.90000000 * 100 ce = 2.32170486 * 100 perplexity = 10.1930372
    def extractResultsInfo(self, baselineContent):
        trainMatches = re.findall('.*(Finished Epoch\[ *\d+ of \d+\]\: \[Training\]) (.*)', baselineContent)
        if trainMatches:
            # Only the last (final-epoch) line is kept; the trailing two
            # characters are stripped as in the original formatting.
            self.trainResult = Baseline.formatLastTrainResult(trainMatches[-1])[0:-2]
        testMatches = re.findall('.*(Final Results: Minibatch\[1-\d+\]:)(\s+\* \d+)?\s+(.*)', baselineContent)
        if testMatches:
            self.testResult = Baseline.formatLastTestResult(testMatches[-1])[0:-2]

    # Extracts cpu and gpu info from the baseline header, delimited by the
    # "CPU info:"/"GPU info:" markers and the following "----------" line.
    def extractHardwareInfo(self, baselineContent):
        cpuSectionStart = baselineContent.find("CPU info:")
        cpuSectionEnd = baselineContent.find("----------", cpuSectionStart)
        cpuMatch = re.search("^CPU info:\s+"
                             "CPU Model (Name:\s*.*)\s+"
                             "(Hardware threads: \d+)\s+"
                             "Total (Memory:\s*.*)\s+", baselineContent[cpuSectionStart:cpuSectionEnd], re.MULTILINE)
        if cpuMatch is None:
            return
        self.cpuInfo = "\n".join(cpuMatch.groups())
        gpuSectionStart = baselineContent.find("GPU info:")
        gpuSectionEnd = baselineContent.find("----------", gpuSectionStart)
        gpuSection = baselineContent[gpuSectionStart:gpuSectionEnd]
        gpuDevices = re.findall("\t\t(Device\[\d+\]: cores = \d+; computeCapability = \d\.\d; type = .*; memory = \d+ MB)[\r\n]?", gpuSection)
        if not gpuDevices:
            return
        self.gpuInfo = "\n".join(gpuDevices)

    @staticmethod
    def formatLastTestResult(line):
        # line is a (header, sample-count, metrics) regex tuple.
        header = line[0] + line[1]
        metrics = line[2].replace('; ', '\n').replace(' ', '\n')
        return header + "\n" + metrics

    @staticmethod
    def formatLastTrainResult(line):
        # line is a (epoch-header, parameters) regex tuple.
        epochsInfo, parameters = line[0], line[1]
        return epochsInfo + '\n' + parameters.replace('; ', '\n')
class Example:
    """One example test (identified by suite/name) discovered under the tests
    directory, together with its parsed baseline files."""

    # Maps lower-cased "<suite>/<name>" to the corresponding Example instance.
    allExamplesIndexedByFullName = {}

    def __init__(self, suite, name, testDir):
        self.suite = suite
        self.name = name
        self.fullName = suite + "/" + name
        self.testDir = testDir
        self.baselineList = []
        self.gitHash = ""

    @staticmethod
    def discoverAllExamples():
        # Every directory containing a testcases.yml file is one example.
        rootDir = thisDir
        for currentDir, _subdirs, files in os.walk(rootDir):
            if 'testcases.yml' not in files:
                continue
            exampleName = os.path.basename(currentDir)
            suiteDir = os.path.dirname(currentDir)
            # suite name will be derived from the path components
            suiteName = os.path.relpath(suiteDir, rootDir).replace('\\', '/')
            discovered = Example(suiteName, exampleName, currentDir)
            Example.allExamplesIndexedByFullName[discovered.fullName.lower()] = discovered

    # it returns a list with all baseline files for current example
    def findBaselineFilesList(self):
        found = []
        # Probe every os/device/flavor-specific baseline name, including the
        # suffix-free fallbacks (e.g. plain "baseline.txt").
        for osSuffix in [".windows", ".linux", ""]:
            for deviceSuffix in [".cpu", ".gpu", ""]:
                for flavorSuffix in [".debug", ".release", ""]:
                    candidateName = "baseline" + osSuffix + flavorSuffix + deviceSuffix + ".txt"
                    fullPath = td.cygpath(os.path.join(self.testDir, candidateName), relative=True)
                    if os.path.isfile(fullPath):
                        found.append(Baseline(fullPath))
        return found
# extracts information for every example and stores it in Example.allExamplesIndexedByFullName
def getExamplesMetrics():
    """Reads every discovered example's baseline files and attaches the parsed
    git hash, hardware info and results to the example."""
    # NOTE: this intentionally replaces the class-level dict with a list sorted
    # by full name; the report writers below iterate that list directly.
    sortedExamples = list(sorted(Example.allExamplesIndexedByFullName.values(), key=lambda test: test.fullName))
    Example.allExamplesIndexedByFullName = sortedExamples
    print ("CNTK - Metrics collector")
    for example in sortedExamples:
        six.print_("Example: " + example.fullName)
        for baseline in example.findBaselineFilesList():
            with open(baseline.fullPath, "r") as f:
                baselineContent = f.read()
            # Baselines without a recorded 40-hex-digit build SHA1 are skipped.
            gitHash = re.search('.*Build SHA1:\s([a-z0-9]{40})[\r\n]+', baselineContent, re.MULTILINE)
            if gitHash is None:
                continue
            example.gitHash = gitHash.group(1)
            baseline.extractHardwareInfo(baselineContent)
            baseline.extractResultsInfo(baselineContent)
            example.baselineList.append(baseline)
# creates a list with links to each example result
def createAsciidocExampleList(file):
    """Writes an asciidoc cross-reference link line for every example that has
    at least one baseline, followed by a terminating blank line."""
    for example in Example.allExamplesIndexedByFullName:
        if example.baselineList:
            anchor = example.fullName.replace("/", "").lower()
            file.write("<<" + anchor + "," + example.fullName + ">> +\n")
    file.write("\n")
def writeMetricsToAsciidoc():
    """Writes the collected metrics of all examples to 'metrics.adoc' as an
    asciidoc table (one section per example, one row per baseline file)."""
    # Open in text mode ('w') and via a context manager: the original opened
    # the file in 'wb' and never closed it; writing str objects to a binary
    # file raises TypeError on Python 3.
    with open("metrics.adoc", 'w') as metricsFile:
        createAsciidocExampleList(metricsFile)
        for example in Example.allExamplesIndexedByFullName:
            if not example.baselineList:
                continue
            metricsFile.write("".join(["===== ", example.fullName, "\n"]))
            metricsFile.write("".join(["**Git Hash: **", example.gitHash, "\n\n"]))
            metricsFile.write("[cols=3, options=\"header\"]\n")
            metricsFile.write("|====\n")
            metricsFile.write("|Log file / Configuration | Train Result | Test Result\n")
            for baseline in example.baselineList:
                # Path of the baseline relative to this directory (strip the
                # leading separator left by the split).
                pathInDir=baseline.fullPath.split(thisDir)[1][1:]
                metricsFile.write("".join(["|link:../blob/", example.gitHash[:7],"/Tests/EndToEndTests/", pathInDir, "[",
                    baseline.fullPath.split("/")[-1], "] .2+|", baseline.trainResult.replace("\n", " "), " .2+|",
                    baseline.testResult.replace("\n", " "), "|\n"]))
                # Hardware info is flattened onto one line; GPU info is only
                # appended when the baseline recorded any GPU devices.
                cpuInfo = "".join(["CPU: ", re.sub("[\r]?\n", ' ', baseline.cpuInfo)])
                gpuInfo = re.sub("[\r]?\n", ' ', baseline.gpuInfo)
                if gpuInfo:
                    metricsFile.write("".join([cpuInfo, " GPU: ", gpuInfo]))
                else:
                    metricsFile.write(cpuInfo)
            metricsFile.write("\n|====\n\n")
# ======================= Entry point =======================
# Guarded so that importing this module from other tooling does not trigger a
# full metrics-collection run; matches the entry-point style of TestDriver.py.
if __name__ == "__main__":
    six.print_("==============================================================================")
    Example.discoverAllExamples()
    getExamplesMetrics()
    writeMetricsToAsciidoc()

Просмотреть файл

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python
# ----------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
@ -687,89 +687,90 @@ def runCommand(args):
sys.exit(10)
# ======================= Entry point =======================
parser = argparse.ArgumentParser(description="TestDriver - CNTK Test Driver")
subparsers = parser.add_subparsers(help="command to execute. Run TestDriver.py <command> --help for command-specific help")
runSubparser = subparsers.add_parser("run", help="run test(s)")
runSubparser.add_argument("test", nargs="*",
help="optional test name(s) to run, specified as Suite/TestName. "
"Use list command to list available tests. "
"If not specified then all tests will be run.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="TestDriver - CNTK Test Driver")
subparsers = parser.add_subparsers(help="command to execute. Run TestDriver.py <command> --help for command-specific help")
runSubparser = subparsers.add_parser("run", help="run test(s)")
runSubparser.add_argument("test", nargs="*",
help="optional test name(s) to run, specified as Suite/TestName. "
"Use list command to list available tests. "
"If not specified then all tests will be run.")
defaultBuildSKU = "gpu"
defaultBuildSKU = "gpu"
runSubparser.add_argument("-b", "--build-location", help="location of the CNTK build to run")
runSubparser.add_argument("-t", "--tag", help="runs tests which match the specified tag")
runSubparser.add_argument("-d", "--device", help="cpu|gpu - run on a specified device")
runSubparser.add_argument("-f", "--flavor", help="release|debug - run only a specified flavor")
runSubparser.add_argument("-s", "--build-sku", default=defaultBuildSKU, help="cpu|gpu|1bitsgd - run tests only for a specified build SKU")
tmpDir = os.getenv("TEMP") if windows else "/tmp"
defaultRunDir=os.path.join(tmpDir, "cntk-test-{0}.{1}".format(time.strftime("%Y%m%d%H%M%S"), random.randint(0,1000000)))
runSubparser.add_argument("-r", "--run-dir", default=defaultRunDir, help="directory where to store test output, default: a random dir within /tmp")
runSubparser.add_argument("--update-baseline", action='store_true', help="update baseline file(s) instead of matching them")
runSubparser.add_argument("--create-baseline", action='store_true', help="create new baseline file(s) (named as baseline.<os>.<device>.txt) for tests that do not currently have baselines")
runSubparser.add_argument("-v", "--verbose", action='store_true', help="verbose output - dump all output of test script")
runSubparser.add_argument("-n", "--dry-run", action='store_true', help="do not run the tests, only print test names and configurations to be run along with full command lines")
runSubparser.add_argument("-b", "--build-location", help="location of the CNTK build to run")
runSubparser.add_argument("-t", "--tag", help="runs tests which match the specified tag")
runSubparser.add_argument("-d", "--device", help="cpu|gpu - run on a specified device")
runSubparser.add_argument("-f", "--flavor", help="release|debug - run only a specified flavor")
runSubparser.add_argument("-s", "--build-sku", default=defaultBuildSKU, help="cpu|gpu|1bitsgd - run tests only for a specified build SKU")
tmpDir = os.getenv("TEMP") if windows else "/tmp"
defaultRunDir=os.path.join(tmpDir, "cntk-test-{0}.{1}".format(time.strftime("%Y%m%d%H%M%S"), random.randint(0,1000000)))
runSubparser.add_argument("-r", "--run-dir", default=defaultRunDir, help="directory where to store test output, default: a random dir within /tmp")
runSubparser.add_argument("--update-baseline", action='store_true', help="update baseline file(s) instead of matching them")
runSubparser.add_argument("--create-baseline", action='store_true', help="create new baseline file(s) (named as baseline.<os>.<device>.txt) for tests that do not currently have baselines")
runSubparser.add_argument("-v", "--verbose", action='store_true', help="verbose output - dump all output of test script")
runSubparser.add_argument("-n", "--dry-run", action='store_true', help="do not run the tests, only print test names and configurations to be run along with full command lines")
runSubparser.set_defaults(func=runCommand)
runSubparser.set_defaults(func=runCommand)
listSubparser = subparsers.add_parser("list", help="list available tests")
listSubparser.add_argument("-t", "--tag", help="limits a resulting list to tests matching the specified tag")
listSubparser.add_argument("-d", "--device", help="cpu|gpu - tests for a specified device")
listSubparser.add_argument("-f", "--flavor", help="release|debug - tests for specified flavor")
listSubparser.add_argument("-s", "--build-sku", default=defaultBuildSKU, help="cpu|gpu|1bitsgd - list tests only for a specified build SKU")
listSubparser.add_argument("--os", help="windows|linux - tests for a specified operating system")
listSubparser = subparsers.add_parser("list", help="list available tests")
listSubparser.add_argument("-t", "--tag", help="limits a resulting list to tests matching the specified tag")
listSubparser.add_argument("-d", "--device", help="cpu|gpu - tests for a specified device")
listSubparser.add_argument("-f", "--flavor", help="release|debug - tests for specified flavor")
listSubparser.add_argument("-s", "--build-sku", default=defaultBuildSKU, help="cpu|gpu|1bitsgd - list tests only for a specified build SKU")
listSubparser.add_argument("--os", help="windows|linux - tests for a specified operating system")
listSubparser.set_defaults(func=listCommand)
listSubparser.set_defaults(func=listCommand)
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args(sys.argv[1:])
# parsing a --device, --flavor and --os options:
args.devices = ["cpu", "gpu"]
if (args.device):
args.device = args.device.lower()
if not args.device in args.devices:
six.print_("--device must be one of", args.devices, file=sys.stderr)
sys.exit(1)
args.devices = [args.device]
args.flavors = ["debug", "release"]
if (args.flavor):
args.flavor = args.flavor.lower()
if not args.flavor in args.flavors:
six.print_("--flavor must be one of", args.flavors, file=sys.stderr)
sys.exit(1)
args.flavors = [args.flavor]
args.buildSKUs = ["cpu", "gpu", "1bitsgd"]
if (args.build_sku):
args.build_sku = args.build_sku.lower()
if not args.build_sku in args.buildSKUs:
six.print_("--build-sku must be one of", args.buildSKUs, file=sys.stderr)
sys.exit(1)
args.buildSKUs = [args.build_sku]
if args.build_sku == "cpu" and args.devices == ["gpu"]:
print >>sys.stderr, "Invalid combination: --build-sku cpu and --device gpu"
sys.exit(1)
if args.func == runCommand and not args.build_location:
args.build_location = os.path.realpath(os.path.join(thisDir, "../..", "x64" if windows else "build/"))
if args.func == listCommand:
args.oses = ["windows", "linux"]
if (args.os):
args.os = args.os.lower()
if not args.os in args.oses:
six.print_("--os must be one of", args.oses, file=sys.stderr)
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args.oses = [args.os]
# discover all the tests
Test.discoverAllTests()
args = parser.parse_args(sys.argv[1:])
# execute the command
args.func(args)
# parsing a --device, --flavor and --os options:
args.devices = ["cpu", "gpu"]
if (args.device):
args.device = args.device.lower()
if not args.device in args.devices:
six.print_("--device must be one of", args.devices, file=sys.stderr)
sys.exit(1)
args.devices = [args.device]
args.flavors = ["debug", "release"]
if (args.flavor):
args.flavor = args.flavor.lower()
if not args.flavor in args.flavors:
six.print_("--flavor must be one of", args.flavors, file=sys.stderr)
sys.exit(1)
args.flavors = [args.flavor]
args.buildSKUs = ["cpu", "gpu", "1bitsgd"]
if (args.build_sku):
args.build_sku = args.build_sku.lower()
if not args.build_sku in args.buildSKUs:
six.print_("--build-sku must be one of", args.buildSKUs, file=sys.stderr)
sys.exit(1)
args.buildSKUs = [args.build_sku]
if args.build_sku == "cpu" and args.devices == ["gpu"]:
print >>sys.stderr, "Invalid combination: --build-sku cpu and --device gpu"
sys.exit(1)
if args.func == runCommand and not args.build_location:
args.build_location = os.path.realpath(os.path.join(thisDir, "../..", "x64" if windows else "build/"))
if args.func == listCommand:
args.oses = ["windows", "linux"]
if (args.os):
args.os = args.os.lower()
if not args.os in args.oses:
six.print_("--os must be one of", args.oses, file=sys.stderr)
sys.exit(1)
args.oses = [args.os]
# discover all the tests
Test.discoverAllTests()
# execute the command
args.func(args)

Просмотреть файл

@ -1,5 +1,8 @@
#!/bin/bash
# ----------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
# ----------------------------------------------------------
# Helper script containing common code used by run-test scripts of E2E tests
BinaryPath=$TEST_CNTK_BINARY
@ -26,6 +29,21 @@ MPIArgs=
DeleteExistingModels=1
DeleteModelsAfterTest=1
# Print info needed by MetricsDriver.py to extract tests metrics
printHardwareInfo()
{
cpuName=$(cat /proc/cpuinfo 2> /dev/null | grep -m 1 'model name' | cut -d : -f 2- | tr -s " " | cut -c 2-)
totalMemory=$(cat /proc/meminfo 2> /dev/null | grep 'MemTotal' | cut -d : -f 2- | tr -s " " | cut -c 2-)
nproc=$(nproc)
# Note that MetricsDriver.py depends on this format
echo "CPU info:"
echo " CPU Model Name: $cpuName"
echo " Hardware threads: $nproc"
echo " Total Memory: $totalMemory"
echo "-------------------------------------------------------------------"
}
# Helper function to print and run a command
run()
{
@ -119,3 +137,6 @@ cntkmpirun()
cntkrun "$2" "$3"
return $?
}
# place printHardwareInfo here, so that all tests print it
printHardwareInfo

Просмотреть файл

@ -5,6 +5,8 @@
#include "stdafx.h"
#include "EvalTestHelper.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
using namespace Microsoft::MSR::CNTK;
@ -21,22 +23,10 @@ BOOST_FIXTURE_TEST_SUITE(EvalTestSuite, EvalFixture)
IEvaluateModelExtended<float>* SetupNetworkAndGetLayouts(std::string modelDefinition, VariableSchema& inputLayouts, VariableSchema& outputLayouts)
{
// Load the eval library
auto hModule = LoadLibrary(L"evaldll.dll");
if (hModule == nullptr)
{
auto err = GetLastError();
throw std::exception((boost::format("Cannot load evaldll.dll: 0x%08lx") % err).str().c_str());
}
// Get the factory method to the evaluation engine
std::string func = "GetEvalExtendedF";
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<float>)procAddress;
// Native model evaluation instance
IEvaluateModelExtended<float> *eval;
getEvalProc(&eval);
GetEvalExtendedF(&eval);
try
{
@ -44,7 +34,7 @@ IEvaluateModelExtended<float>* SetupNetworkAndGetLayouts(std::string modelDefini
}
catch (std::exception& ex)
{
fprintf(stderr, ex.what());
fprintf(stderr, "%s\n", ex.what());
throw;
}
fflush(stderr);
@ -53,9 +43,9 @@ IEvaluateModelExtended<float>* SetupNetworkAndGetLayouts(std::string modelDefini
outputLayouts = eval->GetOutputSchema();
for (auto vl : outputLayouts)
{
fprintf(stderr, "Output dimension: %d\n", vl.m_numElements);
fprintf(stderr, "Output name: %ls\n", vl.m_name);
{
fprintf(stderr, "Output dimension: %" PRIu64 "\n", vl.m_numElements);
fprintf(stderr, "Output name: %ls\n", vl.m_name.c_str());
}
eval->StartForwardEvaluation({outputLayouts[0].m_name});

Просмотреть файл

@ -63,13 +63,13 @@ struct EvalFixture
if (!envVariableErrorMessage.empty())
{
BOOST_TEST_MESSAGE(envVariableErrorMessage);
fprintf(stderr, envVariableErrorMessage.c_str());
fprintf(stderr, "%s\n", envVariableErrorMessage.c_str());
}
newCurrentPath = m_testDataPath;
}
}
else if ((subPath[0] == '/' && subPath[1] == '//') || (subPath[0] == '\\' && subPath[1] == '\\'))
else if ((subPath[0] == '/' && subPath[1] == '/') || (subPath[0] == '\\' && subPath[1] == '\\'))
{
newCurrentPath = subPath;
}

Просмотреть файл

@ -72,6 +72,7 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)..;$(BOOST_LIB_PATH)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDll.lib; %(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
@ -86,8 +87,7 @@
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
@ -112,8 +112,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<PreprocessorDefinitions>WIN32;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<PreprocessorDefinitions>WIN32;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
@ -168,4 +167,4 @@
<Output TaskParameter="DestinationFiles" ItemName="NewFileWrites" />
</Copy>
</Target>
</Project>
</Project>

Просмотреть файл

@ -9,16 +9,19 @@
#define _SCL_SECURE_NO_WARNINGS // current API of matrix does not allow safe invokations. TODO: change api to proper one.
#ifdef _WIN32
#include "targetver.h"
#include "targetver.h"
#include <windows.h>
#endif
#include <stdio.h>
#include <windows.h>
// TODO: reference additional headers your program requires here
#include "Eval.h"
//Adding required boost header
#ifndef _WIN32
// Use dynamic library on Linux
#define BOOST_TEST_DYN_LINK
#endif
#include <boost/test/unit_test.hpp>
#include <boost/format.hpp>

Просмотреть файл

@ -5,171 +5,20 @@
// MathPerformanceTests.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#define NOMINMAX
#include "Windows.h"
//#define NOMINMAX
//#include "Windows.h"
#include "Matrix.h"
#include "CPUMatrix.h"
#include "TensorView.h"
#include "Sequences.h"
#include <chrono>
#include <iostream>
#include <vector>
#include "Matrix.h"
#include "CPUMatrix.h"
#include "Sequences.h"
#include <algorithm>
using namespace Microsoft::MSR::CNTK;
using namespace std;
template <class ElemType>
void SetToInitStateValueForResetSeg(const Matrix<ElemType>& sentenceBegin,
size_t nStream, ElemType initStateValue, Matrix<ElemType>& newprevstate)
{
Matrix<ElemType> colSeg(sentenceBegin.GetDeviceId());
colSeg.Resize(nStream, nStream);
size_t nStateRow = newprevstate.GetNumRows();
assert(nStream == sentenceBegin.GetNumRows());
// only set state to init state value for segmentation = 0, and -1
// e.g., -1 0 1 -> 0 0 1 -> 0 0 -1 -> 1 1 0
Matrix<ElemType> colPos(sentenceBegin.GetDeviceId());
colPos.SetValue(sentenceBegin); // -1 0 1
colPos.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies
Matrix<ElemType>::Scale((ElemType) -1.0, colPos);
colPos += 0; // (int)MinibatchPackingFlags::None; // TODO: these flags no longer exist, this test probably no longer applies
colSeg.SetDiagonalValue(colPos);
Matrix<ElemType> ones(sentenceBegin.GetDeviceId());
ones.Resize(nStateRow, nStream);
ones.SetValue((ElemType) 1);
// add default state value if it is for reset
Matrix<ElemType>::MultiplyAndWeightedAdd(initStateValue, ones, false, colSeg, false, 1.0, newprevstate); // += [0 initStateValue 0 ]
}
template <class ElemType>
void rnnForwardPropSRP(Matrix<ElemType>& functionValues, size_t mNbr, Matrix<ElemType>& pastActivity, Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& colBegin, const Matrix<ElemType>& needToCompute)
{
size_t ncol = functionValues.GetNumCols();
size_t ntime = ncol / mNbr;
Matrix<ElemType> out = functionValues.ColumnSlice(0, mNbr);
Matrix<ElemType> inp((DEVICEID_TYPE) functionValues.GetDeviceId());
for (size_t d = 0; d < ntime; d++)
{
if (d == 0)
inp = pastActivity.ColumnSlice(d, mNbr);
else
inp = inputFunctionValues.ColumnSlice(d, mNbr);
if (needToCompute.ColumnSlice(d, 1).Get00Element() == 1)
{
Matrix<ElemType> colSegPastActivity((DEVICEID_TYPE) functionValues.GetDeviceId());
Matrix<ElemType> colSeg((DEVICEID_TYPE) functionValues.GetDeviceId());
colSeg.Resize(mNbr, mNbr);
colSeg.SetValue(0);
colSegPastActivity.SetValue(colBegin);
colSegPastActivity.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies
colSeg.SetDiagonalValue(colSegPastActivity);
Matrix<ElemType>::Multiply(inp, false, colSeg, false, out);
ElemType initStateValue = (ElemType) 0.1;
SetToInitStateValueForResetSeg<ElemType>(colBegin, mNbr, initStateValue, out);
}
}
}
template <class ElemType>
void oldRnnForwardPropSRP(Matrix<ElemType>& functionValues, size_t mNbr, Matrix<ElemType>& pastActivity, Matrix<ElemType>& inputFunctionValues)
{
size_t ncol = functionValues.GetNumCols();
size_t ntime = ncol / mNbr;
for (size_t timeIdxInSeq = 0; timeIdxInSeq < ntime; timeIdxInSeq++)
{
for (size_t i = 0; i < mNbr; i++)
{
bool reset = false;
if (timeIdxInSeq == 0)
{
reset = true;
}
oldRNNForwardPropSRP<ElemType>(timeIdxInSeq, 1, reset, (ElemType) 0.1, functionValues, pastActivity, inputFunctionValues, i, mNbr);
}
}
}
template <class ElemType>
void oldRNNForwardPropSRP(const size_t timeIdxInSeq, const int delay, const bool reset, const ElemType default_activity, Matrix<ElemType>& functionValues, const Matrix<ElemType>& pastActivity, const Matrix<ElemType>& inputFunctionValues, const size_t indexInBatch, const size_t mNbr)
{
assert(delay > 0);
if (functionValues.GetNumRows() != inputFunctionValues.GetNumRows() ||
functionValues.GetNumCols() != inputFunctionValues.GetNumCols())
functionValues.Resize(inputFunctionValues.GetNumRows(),
inputFunctionValues.GetNumCols());
int iPastIndex = (int) ((int) timeIdxInSeq - (int) delay) * (int) mNbr;
int d = iPastIndex;
if (d < 0)
d = (int) functionValues.Mod((float) iPastIndex, (float) pastActivity.GetNumCols());
// this can point to the past activity of the previous mninibatch
Matrix<ElemType> out = functionValues.ColumnSlice(timeIdxInSeq * mNbr + indexInBatch, 1);
Matrix<ElemType> inp((DEVICEID_TYPE) functionValues.GetDeviceId());
if (reset)
out.SetValue(default_activity);
else
{
if (iPastIndex < 0)
inp = pastActivity.ColumnSlice(d + indexInBatch, 1);
else
inp = inputFunctionValues.ColumnSlice(d + indexInBatch, 1);
out.AssignValuesOf(inp);
}
}
/**
The new way of resetting RNN state.
*/
template <class ElemType>
void TestRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0)
{
Matrix<ElemType> functionValues(deviceID);
Matrix<ElemType> colBegin(deviceID);
Matrix<ElemType> pastActivity(deviceID);
Matrix<ElemType> inputFunctionValues(deviceID);
Matrix<ElemType> needToCompute(deviceID);
functionValues.Resize(nRow, nCol);
colBegin.Resize(mNbr, 1);
pastActivity.Resize(nRow, nCol);
inputFunctionValues.Resize(nRow, nCol);
needToCompute.Resize(1, nCol / mNbr);
needToCompute.SetValue(0);
needToCompute.ColumnSlice(0, 1).SetValue(1);
auto t_start = clock();
rnnForwardPropSRP<ElemType>(functionValues, mNbr, pastActivity, inputFunctionValues, colBegin, needToCompute);
auto t_end = clock();
std::cout << "testRnnForwardPropSRP: " << 1.0 * (t_end - t_start) / CLOCKS_PER_SEC << " seconds" << endl;
}
/**
The old way of resetting RNN state, which used if statement. Also only supports up to two sentences within a minibatch
*/
template <class ElemType>
void TestOldRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0)
{
Matrix<ElemType> functionValues(deviceID);
Matrix<ElemType> colBegin(deviceID);
Matrix<ElemType> pastActivity(deviceID);
Matrix<ElemType> inputFunctionValues(deviceID);
functionValues.Resize(nRow, nCol);
colBegin.Resize(mNbr, 1);
pastActivity.Resize(nRow, nCol);
inputFunctionValues.Resize(nRow, nCol);
auto t_start = clock();
oldRnnForwardPropSRP<ElemType>(functionValues, mNbr, pastActivity, inputFunctionValues);
auto t_end = clock();
std::cout << "TestOldRnnForwardPropSRP: " << 1.0 * (t_end - t_start) / CLOCKS_PER_SEC << " seconds" << endl;
}
template <class ElemType>
void randomInitializeCPUMatrix(CPUMatrix<ElemType>& M, float min = -10, float max = 10)
{
@ -250,77 +99,6 @@ void AddMultiplyAndInplaceSigmoidTest(int n, int k, int m)
std::cout << "Matrix in: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
}
template <class ElemType>
void ColumnSliceMultAndAddTest(int n, int k, int m, DEVICEID_TYPE deviceID)
{
cout << "Testing Matrix" << endl;
Matrix<ElemType> AG((size_t) n, (size_t) k, deviceID);
AG.SetUniformRandomValue(-1, 1);
Matrix<ElemType> BG((size_t) k, (size_t) m, deviceID);
BG.SetUniformRandomValue(-1, 1);
Matrix<ElemType> CG((size_t) n, (size_t) m, deviceID);
Matrix<ElemType> DG((size_t) n, (size_t) m, deviceID);
auto t_startG = clock();
Matrix<ElemType>::MultiplyAndAdd(AG, false, BG, false, CG);
auto t_endG = clock();
std::cout << "MultiplyAndAdd Directly: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
t_startG = clock();
for (int i = 0; i < m; i++)
{
Matrix<ElemType> col_BG = BG.ColumnSlice(i, 1);
Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
}
t_endG = clock();
std::cout << "MultiplyAndAdd With ColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
t_startG = clock();
for (int i = 0; i < m; i++)
{
Matrix<ElemType> col_BG = BG.ColumnSlice(i, 1);
Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
}
t_endG = clock();
std::cout << "MultiplyAndAdd With ColumnSlice&: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
Matrix<ElemType> col_BG1(0), col_CG1(0);
t_startG = clock();
for (int i = 0; i < m; i++)
{
col_BG1.AssignColumnSlice(BG, i, 1);
col_CG1.AssignColumnSlice(CG, i, 1);
Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG1, false, col_CG1);
}
t_endG = clock();
std::cout << "MultiplyAndAdd With AssignColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
t_startG = clock();
for (int i = 0; i < m; i++)
{
Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
Matrix<ElemType> col_DG = DG.ColumnSlice(i, 1);
col_DG.AssignSigmoidOf(col_CG);
}
t_endG = clock();
std::cout << "AssignSigmoidOf With ColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
t_startG = clock();
for (int i = 0; i < m; i++)
{
col_BG1.AssignColumnSlice(BG, i, 1);
col_CG1.AssignColumnSlice(CG, i, 1);
col_BG1.AssignSigmoidOf(col_CG1);
}
t_endG = clock();
std::cout << "AssignSigmoidOf With AssignColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
}
template <class ElemType>
void SquareMultiplyAndAdd10TimesAvgTest(int n, int count)
{
@ -437,12 +215,6 @@ void MandSTest(int count, int devId)
int wmain()
{
ColumnSliceMultAndAddTest<float>(2048, 2048, 256, 0);
TestRnnForwardPropSRP<float>();
TestOldRnnForwardPropSRP<float>();
// MandSTest<float>(100, 2);
/*cout<<endl<<"********************Matrix SquareMultiplyAndWeightedAdd10TimesAvg TEST********************"<<endl;

Просмотреть файл

@ -114,10 +114,11 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\Source\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="MathPerformanceTests.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
</Project>
</Project>

Просмотреть файл

@ -8,15 +8,11 @@
namespace Microsoft { namespace MSR { namespace CNTK { namespace TEST {
//The simplest possible matrix multiplier, used here as a check.
template<typename ScalarAT, typename ScalarBT, typename ScalarCT, int MAXRANGE = 1 << ((8 * sizeof(ScalarAT)) - 3)> class ReferenceMultiplier
template<typename ScalarAT, typename ScalarBT, typename ScalarCT> class ReferenceMultiplier
{
public:
typedef ScalarAT ScalarAT;
typedef ScalarBT ScalarBT;
typedef ScalarCT ScalarCT;
static const int MAXRANGE = MAXRANGE;
static const int MAXRANGE = 1 << ((8 * sizeof(ScalarAT)) - 3);
ScalarBT* PrepareB(ScalarBT* oldB, int k, int n) { return oldB; }
static ScalarAT* CreateMatrixA(int m, int n)
@ -77,9 +73,17 @@ template<typename ScalarAT, typename ScalarBT, typename ScalarCT, int MAXRANGE =
}
};
template<typename ScalarAT, typename ScalarBT, typename ScalarCT, typename MultiplierT>static void TestMultiplierSub(
int m, int k, int n, MultiplierT& testMult, int numThreads = 1, ScalarCT epsilon = ScalarCT())
template<typename ScalarCT> void CompareMatricesAndDump(const ScalarCT* ref, const ScalarCT* test,
int m, int /*k*/, int n)
{
for (int i = 0; i < m * n; ++i)
{
BOOST_CHECK_EQUAL(ref[i], test[i]);
}
}
template<typename ScalarAT, typename ScalarBT, typename ScalarCT, typename MultiplierT>static void TestMultiplierSub(
int m, int k, int n, MultiplierT& testMult, int numThreads = 1, ScalarCT epsilon = ScalarCT())
{
epsilon;
testMult.SetNumThreads(numThreads);
@ -126,23 +130,13 @@ template<typename ScalarAT, typename ScalarBT, typename ScalarCT, int MAXRANGE =
}
template<typename ScalarAT, typename ScalarBT, typename ScalarCT, typename MultiplierT>static void TestMultiplierSub(
template<typename ScalarAT, typename ScalarBT, typename ScalarCT, typename MultiplierT>static void TestMultiplierSub(
int m, int k, int n, int numThreads = 1, ScalarCT epsilon = ScalarCT())
{
MultiplierT testMult;
TestMultiplierSub<ScalarAT, ScalarBT, ScalarCT, MultiplierT>(m, k, n, testMult, numThreads, epsilon);
}
template<typename ScalarCT> void CompareMatricesAndDump(const ScalarCT* ref, const ScalarCT* test,
int m, int /*k*/, int n)
{
for (int i = 0; i < m * n; ++i)
{
BOOST_CHECK_EQUAL(ref[i], test[i]);
}
}
BOOST_AUTO_TEST_SUITE(BlockMultiplierSuite)
BOOST_AUTO_TEST_CASE(BlockMultiplyTest8x128x8SingleThread)

Просмотреть файл

@ -3,7 +3,9 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#ifdef _WIN32
#include <crtdefs.h>
#endif
#include "../../../Source/Math/CPUSparseMatrix.h"
using namespace Microsoft::MSR::CNTK;

Просмотреть файл

@ -6,7 +6,9 @@
//
#include "stdafx.h"
#include <math.h>
#ifdef _WIN32
#include <crtdefs.h>
#endif
#include "../../../Source/Math/GPUSparseMatrix.h"
using namespace Microsoft::MSR::CNTK;

Просмотреть файл

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
@ -66,7 +66,7 @@
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(BOOST_INCLUDE_PATH);$(SolutionDir)Source\Common\Include</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(BOOST_INCLUDE_PATH);$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
@ -135,6 +135,7 @@
<ClInclude Include="fixtures.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="TensorTestsHelper.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="BatchNormalizationEngineTests.cpp" />
@ -156,6 +157,7 @@
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="CPUMatrixTests.cpp" />
<ClCompile Include="TensorTests.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<Target Name="Build" Condition="$(HasBoost)" Outputs="$(TargetPath)" DependsOnTargets="$(BuildDependsOn)" />

Просмотреть файл

@ -4,7 +4,9 @@
//
#include "stdafx.h"
#include <math.h>
#ifdef _WIN32
#include <crtdefs.h>
#endif
#include "../../../Source/Math/Matrix.h"
#include "../../../Source/Math/CPUMatrix.h"

Просмотреть файл

@ -5,7 +5,12 @@
#include "stdafx.h"
#include "File.h"
#include <memory>
#ifdef _WIN32
#include <io.h>
#else // Linux
#define _dup2 dup2
#define _fileno fileno
#endif
#include "../../../Source/Math/MatrixQuantizerImpl.h"
#include "../../../Source/Math/CUDAPageLockedMemAllocator.h"

Просмотреть файл

@ -0,0 +1,105 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
//
#include "stdafx.h"
#include "TensorView.h"
#include "Sequences.h"
#include "TensorTestsHelper.h"
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
BOOST_AUTO_TEST_SUITE(MathTensorTests)
BOOST_AUTO_TEST_CASE(ElementwiseAddition)
{
Test::TensorTest<float> tensorTester;
// --- elementwise
// elementwise sum
tensorTester.OneTensorTest("elementwise addition", 1e-8, [&tensorTester](DEVICEID_TYPE deviceId)
{
return tensorTester.BroadcastingTest(TensorShape{ 512, 256 }, TensorShape({ 512, 256 }), deviceId);
});
}
BOOST_AUTO_TEST_CASE(AdditionWithSimpleBroadcasting)
{
Test::TensorTest<float> tensorTester;
// --- broadcasting
// simple broadcasting
tensorTester.OneTensorTest("addition wth simple broadcasting", 1e-8, [&tensorTester](DEVICEID_TYPE deviceId)
{
return tensorTester.BroadcastingTest(TensorShape{ 3, 2 }, TensorShape({ 3, 1 }), deviceId);
});
}
BOOST_AUTO_TEST_CASE(BiasAddition)
{
Test::TensorTest<float> tensorTester;
// typical bias for convolutional layer
tensorTester.OneTensorTest("bias addition (broadcasting)", 1e-8, [&tensorTester](DEVICEID_TYPE deviceId)
{
return tensorTester.BroadcastingTest(TensorShape{ 28, 28, 128, 32 }, TensorShape({ 1, 1, 128 }), deviceId);
});
}
BOOST_AUTO_TEST_CASE(BiasAddition2)
{
Test::TensorTest<float> tensorTester;
// BUGBUG: This test is strange--Print() shows different values with depth 128 instead of 64, but IsEqual() does not fail with 1e-3 tolerance.
// Something fishy going on. Dimension overflow?
tensorTester.OneTensorTest("bias addition (broadcasting)", 1e-8, [&tensorTester](DEVICEID_TYPE deviceId)
{
return tensorTester.BroadcastingTest(TensorShape{ 256, 256, 64, 32 }, TensorShape({ 1, 1, 64 }), deviceId);
});
}
BOOST_AUTO_TEST_CASE(BiasGradient)
{
Test::TensorTest<float> tensorTester;
// --- reduction
// typical bias gradient (reduction) for FF-DNN
tensorTester.OneTensorTest("bias gradient (reduction)", 1e-4, [&tensorTester](DEVICEID_TYPE deviceId)
{
return tensorTester.BiasGradientTest(TensorShape{ 2048, 1024 }, TensorShape(2048), deviceId);
});
}
BOOST_AUTO_TEST_CASE(BiasGradient2)
{
Test::TensorTest<float> tensorTester;
// typical bias gradient (reduction) for convolutional layer
tensorTester.OneTensorTest("bias gradient (reduction)", 1e-1, [&tensorTester](DEVICEID_TYPE deviceId)
{
return tensorTester.BiasGradientTest(TensorShape{ 256, 256, 64, 32 }, TensorShape({ 1, 1, 64 }), deviceId);
});
}
BOOST_AUTO_TEST_CASE(ColumnSliceMultAndAdd)
{
ColumnSliceMultAndAddTest<float>(2048, 2048, 256, 0);
}
BOOST_AUTO_TEST_CASE(RnnForwardProp)
{
TestRnnForwardPropSRP<float>();
}
BOOST_AUTO_TEST_CASE(OldRnnForwardProp)
{
TestOldRnnForwardPropSRP<float>();
}
BOOST_AUTO_TEST_SUITE_END()
} } } }

Просмотреть файл

@ -0,0 +1,310 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Matrix.h"
#include "CPUMatrix.h"
#include "TensorView.h"
#include "Sequences.h"
#include <chrono>
#include <iostream>
#include <vector>
#include <algorithm>
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
// Test harness for TensorView operations: each test is run twice, on the GPU
// (deviceId 0) and on the CPU (deviceId -1), and the two results are compared.
template <class ElemType>
struct TensorTest
{
    // run one test for both GPU and CPU and verify they are the same
    // 'what'      : human-readable test name printed to stderr
    // 'tolerance' : maximum deviation accepted by Matrix::IsEqualTo
    // 'fn'        : callable taking a DEVICEID_TYPE and returning the result TensorView
    template<typename FN>
    void OneTensorTest(const char* what, double tolerance, const FN& fn)
    {
        fprintf(stderr, "===== Tensor test '%s'\n", what);
        // run on GPU and CPU
        let resultGPU = fn(0);
        let resultCPU = fn(-1);
        // dump top corner of the result to get a feel for the error
        resultGPU.GetSOB().Print("GPU result", 0, 7, 0, 9);
        // bring the GPU result over to the CPU so the comparison below is on one device
        resultGPU.GetSOB().TransferToDeviceIfNotThere(-1, true, false, true);
        resultCPU.GetSOB().Print("CPU result", 0, 7, 0, 9);
        BOOST_CHECK(resultGPU.GetSOB().IsEqualTo(resultCPU.GetSOB(), (ElemType)tolerance));
    }

    // helper to create a randomly initialized tensor object
    // 'randomSeed' makes the init reproducible across the GPU and CPU runs;
    // 'isResult' only changes how the tensor is logged (marks it as the output).
    TensorView<ElemType> CreateTensor(TensorShape shape, int randomSeed, DEVICEID_TYPE deviceId, bool isResult = false)
    {
        let numElements = shape.GetNumElements();
        if (isResult)
            cout << " ->";
        cout << " [" << string(shape) << "]";
        if (isResult)
            cout << " \t// " << (deviceId < 0 ? "C" : "G") << "PU\n " << flush;
        // random init
        mt19937 rng(randomSeed);
        uniform_real_distribution<float> nd(-1, 1);
        vector<ElemType> init(numElements);
        generate(begin(init), end(init), [&] { return nd(rng); });
        // create storage object (one-column matrix)
        let sob = make_shared<Matrix<ElemType>>(numElements/*rows*/, 1/*cols*/, init.data(), deviceId);
        // create TensorView
        return TensorView<ElemType>(sob, shape);
    }

    // test bias gradient (reduction)
    // Reduces a 'layerShape'-sized gradient into a 'biasShape'-sized bias gradient;
    // the reduction over the non-bias dimensions is implied by the smaller target shape.
    TensorView<ElemType> BiasGradientTest(TensorShape layerShape, TensorShape biasShape, DEVICEID_TYPE deviceId)
    {
        int randomSeed = 1;
        let gradient = CreateTensor(layerShape, randomSeed++, deviceId);
        auto bias = CreateTensor(biasShape, randomSeed++, deviceId, true);
        //gradient.GetSOB().Print("incoming gradient", 0, 9, 0, 9);
        //bias.GetSOB().Print("bias gradient", 0, 9, 0, 9);
        bias.DoCopyOf(1, gradient, 1); // presumably bias = 1*bias + 1*gradient (accumulate) — TODO confirm DoCopyOf arg order
        //bias.GetSOB().Print("updated bias gradient", 0, 9, 0, 9);
        return bias;
    }

    // test broadcast summation gradient
    // Adds a 'biasShape'-sized tensor onto a 'layerShape'-sized input; the bias is
    // broadcast up to the layer shape by AssignSumOf.
    TensorView<ElemType> BroadcastingTest(TensorShape layerShape, TensorShape biasShape, DEVICEID_TYPE deviceId)
    {
        int randomSeed = 1;
        let input = CreateTensor(layerShape, randomSeed++, deviceId);
        auto bias = CreateTensor(biasShape, randomSeed++, deviceId);
        //input.GetSOB().Print("input data", 0, 9, 0, 9);
        //bias.GetSOB().Print("bias", 0, 9, 0, 9);
        auto result = CreateTensor(layerShape, randomSeed++, deviceId, true);
        result.AssignSumOf(input, bias);
        return result;
    }
};
template <class ElemType>
void SetToInitStateValueForResetSeg(const Matrix<ElemType>& sentenceBegin, size_t nStream, ElemType initStateValue, Matrix<ElemType>& newprevstate)
{
Matrix<ElemType> colSeg(sentenceBegin.GetDeviceId());
colSeg.Resize(nStream, nStream);
size_t nStateRow = newprevstate.GetNumRows();
assert(nStream == sentenceBegin.GetNumRows());
// only set state to init state value for segmentation = 0, and -1
// e.g., -1 0 1 -> 0 0 1 -> 0 0 -1 -> 1 1 0
Matrix<ElemType> colPos(sentenceBegin.GetDeviceId());
colPos.SetValue(sentenceBegin); // -1 0 1
colPos.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies
Matrix<ElemType>::Scale((ElemType)-1.0, colPos);
colPos += 0; // (int)MinibatchPackingFlags::None; // TODO: these flags no longer exist, this test probably no longer applies
colSeg.SetDiagonalValue(colPos);
Matrix<ElemType> ones(sentenceBegin.GetDeviceId());
ones.Resize(nStateRow, nStream);
ones.SetValue((ElemType)1);
// add default state value if it is for reset
Matrix<ElemType>::MultiplyAndWeightedAdd(initStateValue, ones, false, colSeg, false, 1.0, newprevstate); // += [0 initStateValue 0 ]
}
// Reference forward propagation for the (new) RNN state-reset path, used for timing.
// For each time step flagged in 'needToCompute', gates the incoming activity through
// a diagonal reset selector and then splices in the initial state value (0.1) for
// streams that are at a sequence start.
template <class ElemType>
void rnnForwardPropSRP(Matrix<ElemType>& functionValues, size_t mNbr, Matrix<ElemType>& pastActivity, Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& colBegin, const Matrix<ElemType>& needToCompute)
{
    const size_t nTimeSteps = functionValues.GetNumCols() / mNbr;
    // NOTE(review): the output slice is always frame 0 — presumably fine for a
    // timing-only test, but confirm if this is ever used for correctness.
    Matrix<ElemType> outSlice = functionValues.ColumnSlice(0, mNbr);
    Matrix<ElemType> inSlice((DEVICEID_TYPE)functionValues.GetDeviceId());

    for (size_t t = 0; t < nTimeSteps; t++)
    {
        // step 0 reads the previous minibatch's activity; later steps read the input
        if (t == 0)
            inSlice = pastActivity.ColumnSlice(t, mNbr);
        else
            inSlice = inputFunctionValues.ColumnSlice(t, mNbr);

        if (needToCompute.ColumnSlice(t, 1).Get00Element() != 1)
            continue; // this step is not flagged for computation

        // Build the diagonal reset selector from the sequence-begin flags.
        Matrix<ElemType> beginFlags((DEVICEID_TYPE)functionValues.GetDeviceId());
        beginFlags.SetValue(colBegin);
        beginFlags.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies

        Matrix<ElemType> diagSelector((DEVICEID_TYPE)functionValues.GetDeviceId());
        diagSelector.Resize(mNbr, mNbr);
        diagSelector.SetValue(0);
        diagSelector.SetDiagonalValue(beginFlags);

        // gate the incoming activity, then splice in the initial state for reset streams
        Matrix<ElemType>::Multiply(inSlice, false, diagSelector, false, outSlice);
        SetToInitStateValueForResetSeg<ElemType>(colBegin, mNbr, (ElemType) 0.1, outSlice);
    }
}
template <class ElemType>
void oldRNNForwardPropSRP(const size_t timeIdxInSeq, const int delay, const bool reset, const ElemType default_activity, Matrix<ElemType>& functionValues, const Matrix<ElemType>& pastActivity, const Matrix<ElemType>& inputFunctionValues, const size_t indexInBatch, const size_t mNbr)
{
assert(delay > 0);
if (functionValues.GetNumRows() != inputFunctionValues.GetNumRows() ||
functionValues.GetNumCols() != inputFunctionValues.GetNumCols())
functionValues.Resize(inputFunctionValues.GetNumRows(),
inputFunctionValues.GetNumCols());
int iPastIndex = (int)((int)timeIdxInSeq - (int)delay) * (int)mNbr;
int d = iPastIndex;
if (d < 0)
d = (int)functionValues.Mod((float)iPastIndex, (float)pastActivity.GetNumCols());
// this can point to the past activity of the previous mninibatch
Matrix<ElemType> out = functionValues.ColumnSlice(timeIdxInSeq * mNbr + indexInBatch, 1);
Matrix<ElemType> inp((DEVICEID_TYPE)functionValues.GetDeviceId());
if (reset)
out.SetValue(default_activity);
else
{
if (iPastIndex < 0)
inp = pastActivity.ColumnSlice(d + indexInBatch, 1);
else
inp = inputFunctionValues.ColumnSlice(d + indexInBatch, 1);
out.AssignValuesOf(inp);
}
}
// Drives the legacy per-column forward step over every time step and stream
// of the minibatch, with a delay of 1 and a reset on the very first frame.
template <class ElemType>
void oldRnnForwardPropSRP(Matrix<ElemType>& functionValues, size_t mNbr, Matrix<ElemType>& pastActivity, Matrix<ElemType>& inputFunctionValues)
{
    const size_t nTimeSteps = functionValues.GetNumCols() / mNbr;
    for (size_t t = 0; t < nTimeSteps; t++)
    {
        const bool reset = (t == 0); // only the first frame resets the state
        for (size_t stream = 0; stream < mNbr; stream++)
        {
            oldRNNForwardPropSRP<ElemType>(t, 1, reset, (ElemType) 0.1, functionValues, pastActivity, inputFunctionValues, stream, mNbr);
        }
    }
}
// Micro-benchmark (timings to stderr, no correctness checks). Compares:
//   (1) one whole-matrix MultiplyAndAdd,
//   (2) MultiplyAndAdd column-by-column via ColumnSlice temporaries,
//   (3) the same per-column loop measured a second time,
//   (4) MultiplyAndAdd column-by-column via AssignColumnSlice into reused slices,
// then AssignSigmoidOf per column via ColumnSlice vs. AssignColumnSlice.
// A is n x k, B is k x m, C and D are n x m, all on 'deviceID'.
template <class ElemType>
void ColumnSliceMultAndAddTest(int n, int k, int m, DEVICEID_TYPE deviceID)
{
    // random operands; CG/DG start uninitialized (only timing matters here)
    Matrix<ElemType> AG((size_t)n, (size_t)k, deviceID);
    AG.SetUniformRandomValue(-1, 1);
    Matrix<ElemType> BG((size_t)k, (size_t)m, deviceID);
    BG.SetUniformRandomValue(-1, 1);
    Matrix<ElemType> CG((size_t)n, (size_t)m, deviceID);
    Matrix<ElemType> DG((size_t)n, (size_t)m, deviceID);
    // (1) whole-matrix multiply as the baseline
    auto t_startG = clock();
    Matrix<ElemType>::MultiplyAndAdd(AG, false, BG, false, CG);
    auto t_endG = clock();
    fprintf(stderr, "MultiplyAndAdd Directly: %f seconds\n", 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC);
    // (2) per-column multiply through fresh ColumnSlice temporaries
    t_startG = clock();
    for (int i = 0; i < m; i++)
    {
        Matrix<ElemType> col_BG = BG.ColumnSlice(i, 1);
        Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
        Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
    }
    t_endG = clock();
    fprintf(stderr, "MultiplyAndAdd With ColumnSlice: %f seconds\n", 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC);
    // (3) NOTE(review): this loop is identical to (2) even though the label below
    // says "ColumnSlice&" — presumably it was meant to exercise a by-reference
    // slice variant; confirm the original intent.
    t_startG = clock();
    for (int i = 0; i < m; i++)
    {
        Matrix<ElemType> col_BG = BG.ColumnSlice(i, 1);
        Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
        Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
    }
    t_endG = clock();
    fprintf(stderr, "MultiplyAndAdd With ColumnSlice&: %f seconds\n", 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC);
    // (4) per-column multiply reusing two slice objects via AssignColumnSlice
    // (the slice objects are created on device 0)
    Matrix<ElemType> col_BG1(0), col_CG1(0);
    t_startG = clock();
    for (int i = 0; i < m; i++)
    {
        col_BG1.AssignColumnSlice(BG, i, 1);
        col_CG1.AssignColumnSlice(CG, i, 1);
        Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG1, false, col_CG1);
    }
    t_endG = clock();
    fprintf(stderr, "MultiplyAndAdd With AssignColumnSlice: %f seconds\n", 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC);
    // per-column sigmoid through fresh ColumnSlice temporaries (D = sigmoid(C))
    t_startG = clock();
    for (int i = 0; i < m; i++)
    {
        Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
        Matrix<ElemType> col_DG = DG.ColumnSlice(i, 1);
        col_DG.AssignSigmoidOf(col_CG);
    }
    t_endG = clock();
    fprintf(stderr, "AssignSigmoidOf With ColumnSlice: %f seconds\n", 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC);
    // per-column sigmoid reusing slice objects (note: writes into the B slice here)
    t_startG = clock();
    for (int i = 0; i < m; i++)
    {
        col_BG1.AssignColumnSlice(BG, i, 1);
        col_CG1.AssignColumnSlice(CG, i, 1);
        col_BG1.AssignSigmoidOf(col_CG1);
    }
    t_endG = clock();
    fprintf(stderr, "AssignSigmoidOf With AssignColumnSlice: %f seconds\n", 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC);
}
// Times rnnForwardPropSRP on nRow x nCol (uninitialized) activations split into
// mNbr parallel streams; only the first time step is flagged for computation.
template <class ElemType>
void TestRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0)
{
    // allocate activations and the per-stream begin-flag column
    Matrix<ElemType> functionValues(deviceID);
    functionValues.Resize(nRow, nCol);
    Matrix<ElemType> pastActivity(deviceID);
    pastActivity.Resize(nRow, nCol);
    Matrix<ElemType> inputFunctionValues(deviceID);
    inputFunctionValues.Resize(nRow, nCol);
    Matrix<ElemType> colBegin(deviceID);
    colBegin.Resize(mNbr, 1);

    // flag only the very first time step for computation
    Matrix<ElemType> needToCompute(deviceID);
    needToCompute.Resize(1, nCol / mNbr);
    needToCompute.SetValue(0);
    needToCompute.ColumnSlice(0, 1).SetValue(1);

    const clock_t started = clock();
    rnnForwardPropSRP<ElemType>(functionValues, mNbr, pastActivity, inputFunctionValues, colBegin, needToCompute);
    const clock_t finished = clock();
    fprintf(stderr, "testRnnForwardPropSRP: %f seconds\n", 1.0 * (finished - started) / CLOCKS_PER_SEC);
}
// Times the legacy RNN forward pass (oldRnnForwardPropSRP). The old approach
// reset state with an explicit if-statement and only supports up to two
// sentences within a minibatch.
template <class ElemType>
void TestOldRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0)
{
    // allocate (uninitialized) activations and the per-stream begin-flag column
    Matrix<ElemType> functionValues(deviceID);
    functionValues.Resize(nRow, nCol);
    Matrix<ElemType> pastActivity(deviceID);
    pastActivity.Resize(nRow, nCol);
    Matrix<ElemType> inputFunctionValues(deviceID);
    inputFunctionValues.Resize(nRow, nCol);
    Matrix<ElemType> colBegin(deviceID);
    colBegin.Resize(mNbr, 1);

    const clock_t started = clock();
    oldRnnForwardPropSRP<ElemType>(functionValues, mNbr, pastActivity, inputFunctionValues);
    const clock_t finished = clock();
    fprintf(stderr, "TestOldRnnForwardPropSRP: %f seconds\n", 1.0 * (finished - started) / CLOCKS_PER_SEC);
}
}}}}

Просмотреть файл

@ -15,14 +15,6 @@ struct Err
static const T Rel;
static const T Abs;
};
template <>
const float Err<float>::Rel = 1e-5f;
template <>
const double Err<double>::Rel = 1e-5f;
template <>
const float Err<float>::Abs = 1.192092896e-07f;
template <>
const double Err<double>::Abs = 2.2204460492503131e-016;
bool AreEqual(float a, float b, float maxRelError, float maxAbsError);
bool AreEqual(double a, double b, double maxRelError, double maxAbsError);

Просмотреть файл

@ -3,6 +3,7 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "common.h"
const int c_deviceIdZero = 0;
@ -13,3 +14,12 @@ const float c_epsilonFloatE1 = 0.1f;
const float c_epsilonFloat5E4 = 0.0005f;
const float c_epsilonFloatE5 = 0.00001f;
const double c_epsilonDoubleE11 = 0.00000000001;
template <>
const float Microsoft::MSR::CNTK::Test::Err<float>::Rel = 1e-5f;
template <>
const double Microsoft::MSR::CNTK::Test::Err<double>::Rel = 1e-5f;
template <>
const float Microsoft::MSR::CNTK::Test::Err<float>::Abs = 1.192092896e-07f;
template <>
const double Microsoft::MSR::CNTK::Test::Err<double>::Abs = 2.2204460492503131e-016;

Просмотреть файл

@ -14,8 +14,16 @@
#endif
#define _SCL_SECURE_NO_WARNINGS // current API of matrix does not allow safe invokations. TODO: change api to proper one.
#ifdef _WIN32
#include "targetver.h"
#endif
#include <array>
#ifndef _WIN32
#define BOOST_TEST_DYN_LINK
#endif
#include <boost/test/unit_test.hpp>
#include "constants.h"
#include "fixtures.h"

Просмотреть файл

@ -70,7 +70,7 @@ struct DataFixture
if (!envVariableErrorMessage.empty())
{
BOOST_TEST_MESSAGE(envVariableErrorMessage);
fprintf(stderr, envVariableErrorMessage.c_str());
fprintf(stderr, "%s", envVariableErrorMessage.c_str());
}
newCurrentPath = m_testDataPath;

Просмотреть файл

@ -8,7 +8,15 @@
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms
#define _SCL_SECURE_NO_WARNINGS // current API of matrix does not allow safe invokations. TODO: change api to proper one.
#ifdef _WIN32
#include "targetver.h"
#include "basics.h"
#endif
#include "Basics.h"
#include "BrainScriptParser.h"
#ifndef _WIN32
// Use dynamic library on Linux
#define BOOST_TEST_DYN_LINK
#endif
#include <boost/test/unit_test.hpp>

Просмотреть файл

@ -4,7 +4,14 @@
//
#include "stdafx.h"
#include <algorithm>
#ifdef _WIN32
#include <io.h>
#else // On Linux
#define _dup2 dup2
#define _dup dup
#define _close close
#define _fileno fileno
#endif
#include <cstdio>
#include <boost/scope_exit.hpp>
#include "Common/ReaderTestHelper.h"

Просмотреть файл

@ -64,13 +64,13 @@ struct ReaderFixture
if (!envVariableErrorMessage.empty())
{
BOOST_TEST_MESSAGE(envVariableErrorMessage);
fprintf(stderr, envVariableErrorMessage.c_str());
fprintf(stderr, "%s\n", envVariableErrorMessage.c_str());
}
newCurrentPath = m_testDataPath;
}
}
else if ((subPath[0] == '/' && subPath[1] == '//') || (subPath[0] == '\\' && subPath[1] == '\\'))
else if ((subPath[0] == '/' && subPath[1] == '/') || (subPath[0] == '\\' && subPath[1] == '\\'))
{
newCurrentPath = subPath;
}
@ -295,16 +295,22 @@ struct ReaderFixture
// readerSectionName : the reader field name in the test section
shared_ptr<DataReader> GetDataReader(
const string configFileName,
const string testSectionName,
const string readerSectionName)
const std::string& configFileName,
const std::string& testSectionName,
const std::string& readerSectionName,
std::vector<std::wstring> additionalConfigParameters)
{
std::wstring configFN(configFileName.begin(), configFileName.end());
std::wstring configFileCommand(L"configFile=" + configFN);
std::wstring cntk(L"CNTK");
std::vector<wchar_t*> arg{ &cntk[0], &configFileCommand[0] };
for(auto& p : additionalConfigParameters)
{
arg.push_back(&p[0]);
}
wchar_t* arg[2]{L"CNTK", &configFileCommand[0]};
ConfigParameters config;
const std::string rawConfigString = ConfigParameters::ParseCommandLine(2, arg, config);
const std::string rawConfigString = ConfigParameters::ParseCommandLine((int)arg.size(), &arg[0], config);
config.ResolveVariables(rawConfigString);
const ConfigParameters simpleDemoConfig = config(testSectionName);
@ -344,14 +350,15 @@ struct ReaderFixture
size_t numSubsets,
bool sparseFeatures = false,
bool sparseLabels = false,
bool useSharedLayout = true)
bool useSharedLayout = true,
std::vector<std::wstring> additionalConfigParameters = {})
{
shared_ptr<StreamMinibatchInputs> inputsPtr =
CreateStreamMinibatchInputs<ElemType>(numFeatureFiles, numLabelFiles,
sparseFeatures, sparseLabels, useSharedLayout);
shared_ptr<DataReader> readerPtr = GetDataReader(configFileName,
testSectionName, readerSectionName);
testSectionName, readerSectionName, additionalConfigParameters);
// Perform the data reading
HelperWriteReaderContentToFile<ElemType>(testDataFilePath, *readerPtr, *inputsPtr,
@ -391,11 +398,12 @@ struct ReaderFixture
size_t numSubsets,
bool sparseFeatures = false,
bool sparseLabels = false,
bool useSharedLayout = true)
bool useSharedLayout = true,
std::vector<std::wstring> additionalConfigParameters = {})
{
HelperReadInAndWriteOut<ElemType>(configFileName, testDataFilePath, testSectionName, readerSectionName,
epochSize, mbSize, epochs, numFeatureFiles, numLabelFiles, subsetNum,numSubsets,
sparseFeatures, sparseLabels, useSharedLayout);
sparseFeatures, sparseLabels, useSharedLayout, additionalConfigParameters);
CheckFilesEquivalent(controlDataFilePath, testDataFilePath);
}
@ -408,10 +416,11 @@ struct ReaderFixture
void HelperRunReaderTestWithException(
string configFileName,
string testSectionName,
string readerSectionName)
string readerSectionName,
std::vector<std::wstring> additionalConfigParameters = {})
{
BOOST_CHECK_THROW(
GetDataReader(configFileName,testSectionName, readerSectionName),
GetDataReader(configFileName, testSectionName, readerSectionName, additionalConfigParameters),
ExceptionType);
}
};

Просмотреть файл

@ -0,0 +1,38 @@
RootDir = .
DataDir = $RootDir$
# deviceId = -1 for CPU, >= 0 for GPU devices
deviceId = -1
precision = "double"
Simple_Test = [
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = 450000
verbosity = 0
features1 = [
dim = 40
contextWindow=1
type = "real"
scpFile = "$DataDir$/features.rscp"
]
features2 = [
dim = 100
scpFile = "$DataDir$/ivector.rscp"
type = "real"
expandToUtterance = true
]
labels = [
mlfFile = "$DataDir$/labels.smlf"
labelMappingFile = "$DataDir$/labels.statelist"
labelDim = 9000
labelType = "category"
]
]
]

Просмотреть файл

@ -0,0 +1,32 @@
RootDir = .
DataDir = $RootDir$
# deviceId = -1 for CPU, >= 0 for GPU devices
deviceId = -1
precision = "double"
Simple_Test = [
reader = [
minimizeReaderMemoryFootprint=false
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = 450000
verbosity = 0
features = [
dim = 40
contextWindow=1
type = "real"
scpFile = "$DataDir$/features.rscp"
]
labels = [
mlfFile = "$DataDir$/labels.smlf"
labelMappingFile = "$DataDir$/labels.statelist"
labelDim = 9000
labelType = "category"
]
]
]

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше