Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/fixLinuxMultiMachineTrainingIssue

Conflicts:
	MachineLearning/CNTKSGDLib/AllReduceDistGradAggregator.h
	MachineLearning/CNTKSGDLib/IDistGradAggregator.h
	MachineLearning/CNTKSGDLib/SGD.cpp
This commit is contained in:
Amit Agarwal 2015-09-09 11:36:19 -07:00
Родитель 64444e18bb ab303a9451
Коммит d1488ac896
82 изменённых файлов: 5611 добавлений и 4534 удалений

Просмотреть файл

@ -595,6 +595,7 @@ namespace Microsoft { namespace MSR { namespace BS {
else if (what == L"Replace")
us = Replace(arg, config[L"replacewhat"], config[L"withwhat"]);
else
// TODO: this should become whatArg.Fail(...)
throw EvaluationError(L"unknown 'what' value to StringFunction: " + what, whatArg.GetLocation());
}
};

Просмотреть файл

@ -1,5 +1,14 @@
// BrainScriptEvaluator.h -- execute what's given in a config file
// TODO: abstract this out from BrainScript --> ConfigurableObjects.h, merged with BrainScriptObjects.h
// This is to allow alternate parsers and glue languages such as Python or .Net.
// The only interdependency with BrainScript currently is through TextLocation.
// -> replace TextLocation with a lambda fail() that is called to report errors.
// That lambda would be set by BrainScript, but in a different way by different glue integrations.
// Consumers of this should, instead of calling GetLocation(), call Fail() on that object.
// Where we now pass a location to a derived expression, we'd now instead pass on that lambda itself.
// This is only needed for our magic understanding of ComputationNode.
#pragma once
#include "Basics.h"
@ -53,6 +62,7 @@ namespace Microsoft { namespace MSR { namespace BS {
// - ConfigArrays elements
// - ConfigLambdas (default values of named arguments)
// TODO: separate this out from BrainScript to an interface that still does type casts--possible?
class ConfigValuePtr : public shared_ptr<Object>
{
TextLocation location; // in source code
@ -324,15 +334,13 @@ namespace Microsoft { namespace MSR { namespace BS {
// We pass rvalue references because that allows to pass Thunks.
vector<wstring> paramNames; // #parameters and parameter names (names are used for naming expressions only)
NamedParams namedParams; // lists named parameters with their default values. Named parameters are optional and thus always must have a default.
// TODO: are these defaults already resolved? Or Thunked and resolved upon first use?
// TODO: Change namedParams to a shared_ptr<map<wstring,ConfigValuePtr>>
public:
template<typename F>
ConfigLambda(vector<wstring> && paramNames, NamedParams && namedParams, const F & f) : paramNames(move(paramNames)), namedParams(move(namedParams)), f(f) { }
size_t GetNumParams() const { return paramNames.size(); }
const vector<wstring> & GetParamNames() const { return paramNames; } // used for expression naming
// what this function does is call f() held in this object with the given arguments except optional arguments are verified and fall back to their defaults if not given
// The arguments are rvalue references, which allows us to pass Thunks, which is important to allow stuff with circular references like CBTK;s DelayedNode.
// The arguments are rvalue references, which allows us to pass Thunks, which is important to allow stuff with circular references like CNTK's DelayedNode.
ConfigValuePtr Apply(vector<ConfigValuePtr> && args, NamedParams && namedArgs, const wstring & exprName)
{
NamedParams actualNamedArgs;
@ -366,6 +374,7 @@ namespace Microsoft { namespace MSR { namespace BS {
// -----------------------------------------------------------------------
// functions exposed by this module
// TODO: This is the only thing that should stay in an actual BrainScriptEvaluator.h.
// -----------------------------------------------------------------------
// understand and execute from the syntactic expression tree

Просмотреть файл

@ -3,13 +3,14 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.21005.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMath", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathDll", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
ProjectSection(ProjectDependencies) = postProject
{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "MachineLearning\CNTK\CNTK.vcxproj", "{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}"
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
{D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {D667AF32-028A-4A5D-BE19-F46776F0F6B2}
{9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {9A2F2441-5972-4EA8-9215-4119FCE0FB68}
@ -17,6 +18,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "MachineLearning\CNT
{014DA766-B37B-4581-BC26-963EA5507931} = {014DA766-B37B-4581-BC26-963EA5507931}
{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {62836DC1-DF77-4B98-BF2D-45C943B7DDC6}
{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
{E6646FFE-3588-4276-8A15-8D65C22711C1} = {E6646FFE-3588-4276-8A15-8D65C22711C1}
EndProjectSection
EndProject
@ -50,8 +52,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LUSequenceReader", "DataRea
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKEval", "MachineLearning\CNTKEval\CNTKEval.vcxproj", "{482999D1-B7E2-466E-9F8D-2119F93EAFD9}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKEvalDll", "MachineLearning\CNTKEval\CNTKEval.vcxproj", "{482999D1-B7E2-466E-9F8D-2119F93EAFD9}"
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
EndProjectSection
EndProject
@ -196,6 +199,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{5F733BBA-F
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{19EE975B-232D-49F0-94C7-6F1C6424FB53}"
ProjectSection(SolutionItems) = preProject
Tests\Speech\LSTM\baseline.cpu.txt = Tests\Speech\LSTM\baseline.cpu.txt
Tests\Speech\LSTM\baseline.gpu.txt = Tests\Speech\LSTM\baseline.gpu.txt
Tests\Speech\LSTM\baseline.windows.cpu.txt = Tests\Speech\LSTM\baseline.windows.cpu.txt
Tests\Speech\LSTM\baseline.windows.gpu.txt = Tests\Speech\LSTM\baseline.windows.gpu.txt
Tests\Speech\LSTM\cntk.config = Tests\Speech\LSTM\cntk.config
Tests\Speech\LSTM\lstmp-3layer_WithSelfStab.ndl = Tests\Speech\LSTM\lstmp-3layer_WithSelfStab.ndl
Tests\Speech\LSTM\run-test = Tests\Speech\LSTM\run-test
@ -204,6 +211,47 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{19EE975B-2
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ParseConfig", "MachineLearning\ParseConfig\ParseConfig.vcxproj", "{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKComputationNetworkLib", "MachineLearning\CNTKComputationNetworkLib\CNTKComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKSGDLib", "MachineLearning\CNTKSGDLib\CNTKSGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{5E666C53-2D82-49C9-9127-3FDDC321C741}"
ProjectSection(SolutionItems) = preProject
Tests\ParallelTraining\SimpleMultiGPU.config = Tests\ParallelTraining\SimpleMultiGPU.config
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{6D1353D6-F196-466F-B886-F16D48759B20}"
ProjectSection(SolutionItems) = preProject
Tests\ParallelTraining\Data\SimpleDataTrain.txt = Tests\ParallelTraining\Data\SimpleDataTrain.txt
Tests\ParallelTraining\Data\SimpleMapping.txt = Tests\ParallelTraining\Data\SimpleMapping.txt
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "NoQuantization", "NoQuantization", "{B6725C9F-A6D2-4269-9B74-7888A90F7884}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SinglePrecision", "SinglePrecision", "{B27DD434-EECD-4EE0-A03B-1150EB87258E}"
ProjectSection(SolutionItems) = preProject
Tests\ParallelTraining\NoQuantization\SinglePrecision\baseline.cpu.txt = Tests\ParallelTraining\NoQuantization\SinglePrecision\baseline.cpu.txt
Tests\ParallelTraining\NoQuantization\SinglePrecision\baseline.gpu.txt = Tests\ParallelTraining\NoQuantization\SinglePrecision\baseline.gpu.txt
Tests\ParallelTraining\NoQuantization\SinglePrecision\baseline.windows.cpu.txt = Tests\ParallelTraining\NoQuantization\SinglePrecision\baseline.windows.cpu.txt
Tests\ParallelTraining\NoQuantization\SinglePrecision\run-test = Tests\ParallelTraining\NoQuantization\SinglePrecision\run-test
Tests\ParallelTraining\NoQuantization\SinglePrecision\testcases.yml = Tests\ParallelTraining\NoQuantization\SinglePrecision\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DoublePrecision", "DoublePrecision", "{A4884465-CFBB-4A64-A9DE-690E1A63EF7E}"
ProjectSection(SolutionItems) = preProject
Tests\ParallelTraining\NoQuantization\DoublePrecision\baseline.cpu.txt = Tests\ParallelTraining\NoQuantization\DoublePrecision\baseline.cpu.txt
Tests\ParallelTraining\NoQuantization\DoublePrecision\baseline.gpu.txt = Tests\ParallelTraining\NoQuantization\DoublePrecision\baseline.gpu.txt
Tests\ParallelTraining\NoQuantization\DoublePrecision\baseline.windows.cpu.txt = Tests\ParallelTraining\NoQuantization\DoublePrecision\baseline.windows.cpu.txt
Tests\ParallelTraining\NoQuantization\DoublePrecision\run-test = Tests\ParallelTraining\NoQuantization\DoublePrecision\run-test
Tests\ParallelTraining\NoQuantization\DoublePrecision\testcases.yml = Tests\ParallelTraining\NoQuantization\DoublePrecision\testcases.yml
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
@ -273,6 +321,14 @@ Global
{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B}.Debug|x64.Build.0 = Debug|x64
{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B}.Release|x64.ActiveCfg = Release|x64
{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B}.Release|x64.Build.0 = Release|x64
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}.Debug|x64.ActiveCfg = Debug|x64
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}.Debug|x64.Build.0 = Debug|x64
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}.Release|x64.ActiveCfg = Release|x64
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}.Release|x64.Build.0 = Release|x64
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}.Debug|x64.ActiveCfg = Debug|x64
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}.Debug|x64.Build.0 = Debug|x64
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}.Release|x64.ActiveCfg = Release|x64
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -282,11 +338,14 @@ Global
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{6CEE834A-8104-46A8-8902-64C81BD7928F} = {D45DF403-6781-444E-B654-A96868C5BE68}
{668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {D45DF403-6781-444E-B654-A96868C5BE68}
{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68}
{DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68}
{C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} = {D45DF403-6781-444E-B654-A96868C5BE68}
{5E666C53-2D82-49C9-9127-3FDDC321C741} = {D45DF403-6781-444E-B654-A96868C5BE68}
{E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94}
@ -306,5 +365,9 @@ Global
{4BBF2950-3DBD-469A-AD57-6CACBEBAF541} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{5F733BBA-FE83-4668-8F83-8B0E78A36619} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{19EE975B-232D-49F0-94C7-6F1C6424FB53} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{6D1353D6-F196-466F-B886-F16D48759B20} = {5E666C53-2D82-49C9-9127-3FDDC321C741}
{B6725C9F-A6D2-4269-9B74-7888A90F7884} = {5E666C53-2D82-49C9-9127-3FDDC321C741}
{B27DD434-EECD-4EE0-A03B-1150EB87258E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884}
{A4884465-CFBB-4A64-A9DE-690E1A63EF7E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884}
EndGlobalSection
EndGlobal

Просмотреть файл

@ -23,6 +23,8 @@
#include <nvml.h> // note: expected at "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include" (Windows) and /the path you installed deployment kit/usr/include/nvidia/gdk (Linux)
#pragma comment (lib, "nvml.lib") // note: expected at "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib" (Windows) and /the path you installed deployment kit/usr/include/nvidia/gdk (Linux)
#include <vector>
#else
int bestGPUDummy = 42; // put something into this CPP, as to avoid a linker warning
#endif
#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
@ -120,6 +122,9 @@ private:
// 0:2:3- an array of ids to use, (PTask will only use the specified IDs)
// *3 - a count of GPUs to use (PTask)
// All - Use all the GPUs (PTask)
#ifdef MATH_EXPORTS
__declspec(dllexport)
#endif
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config)
{
static BestGpu* g_bestGpu = NULL;

Просмотреть файл

@ -22,10 +22,12 @@
#else
#define DATAREADER_API
#endif
#include "Basics.h"
#include "Matrix.h"
#include "commandArgUtil.h" // for ConfigParameters
#include <map>
#include <string>
#include "Basics.h"
namespace Microsoft { namespace MSR { namespace CNTK {

Просмотреть файл

@ -25,10 +25,10 @@
#include "Basics.h"
#include "Matrix.h"
#include "commandArgUtil.h" // for ConfigParameters
#include <map>
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {
// type of data in this section

Просмотреть файл

@ -4,6 +4,8 @@
// </copyright>
//
#pragma once
#include "Basics.h"
#include <stdio.h>
#include <string>
#include <vector>
@ -15,6 +17,8 @@
#include <unistd.h>
#endif
#include "fileutil.h" // for f{ge,pu}t{,Text}()
#include <fstream> // for LoadMatrixFromTextFile() --TODO: change to using this File class
#include <sstream>
namespace Microsoft{ namespace MSR { namespace CNTK {
@ -240,6 +244,74 @@ public:
return *this;
}
// Read a matrix stored in text format from 'filePath' (whitespace-separated columns, newline-separated rows),
// and return a flat array containing the contents of this file in column-major format.
// filePath: path to file containing matrix in text format.
// numRows/numCols: after this function is called, these parameters contain the number of rows/columns in the matrix.
// returns: a flat array containing the contents of this file in column-major format
// NOTE: caller is responsible for deleting the returned buffer once it is finished using it.
// TODO: change to return a std::vector<ElemType>; solves the ownership issue
// This function does not quite fit here, but it fits elsewhere even worse. TODO: change to use File class!
template<class ElemType>
static vector<ElemType> LoadMatrixFromTextFile(const std::string filePath, size_t& numRows, size_t& numCols)
{
size_t r = 0;
size_t numColsInFirstRow = 0;
// NOTE: Not using the Microsoft.MSR.CNTK.File API here because it
// uses a buffer of fixed size, which doesn't allow very long rows.
// See fileutil.cpp fgetline method (std::string fgetline (FILE * f) { fixed_vector<char> buf (1000000); ... })
std::ifstream myfile(filePath);
// load matrix into vector of vectors (since we don't know the size in advance).
std::vector<std::vector<ElemType>> elements;
if (myfile.is_open())
{
std::string line;
while (std::getline(myfile, line))
{
// Break on empty line. This allows there to be an empty line at the end of the file.
if (line == "")
break;
istringstream iss(line);
ElemType element;
int numElementsInRow = 0;
elements.push_back(std::vector<ElemType>());
while (iss >> element)
{
elements[r].push_back(element);
numElementsInRow++;
}
if (r == 0)
numColsInFirstRow = numElementsInRow;
else if (numElementsInRow != numColsInFirstRow)
RuntimeError("The rows in the provided file do not all have the same number of columns: " + filePath);
r++;
}
myfile.close();
}
else
RuntimeError("Unable to open file");
numRows = r;
numCols = numColsInFirstRow;
vector<ElemType> array(numRows * numCols);
// Perform transpose when copying elements from vectors to ElemType[],
// in order to store in column-major format.
for (int i = 0; i < numCols; i++)
{
for (int j = 0; j < numRows; j++)
array[i * numRows + j] = elements[j][i];
}
return array;
}
operator FILE*() const { return m_file; }
};

Просмотреть файл

@ -72,7 +72,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -95,7 +95,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -74,7 +74,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -97,7 +97,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -71,7 +71,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@ -91,7 +91,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>

Просмотреть файл

@ -92,7 +92,7 @@ public:
// ---------------------------------------------------------------------------
// biggrowablevector -- big vector we can push_back to
// ---------------------------------------------------------------------------
template<typename ELEMTYPE> class biggrowablevector : public growablevectorbase<std::vector<ELEMTYPE>>
template<class ELEMTYPE> class biggrowablevector : public growablevectorbase<std::vector<ELEMTYPE>>
{
public:
biggrowablevector() : growablevectorbase<std::vector<ELEMTYPE>>::growablevectorbase (65536) { }

Просмотреть файл

@ -73,7 +73,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -96,7 +96,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -73,7 +73,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -96,7 +96,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -74,7 +74,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -97,7 +97,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -74,7 +74,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -97,7 +97,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -72,7 +72,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -95,7 +95,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKMathDll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<Profile>true</Profile>
</Link>

Просмотреть файл

@ -6,6 +6,8 @@
// cn.cpp : Defines the entry point for the console application.
//
// TODO: should we split all these DoXXX() up into separate commands? Mainly to separate common vs. non-standard/special ones?
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "stdafx.h"
@ -36,7 +38,9 @@
#include "ExperimentalNetworkBuilder.h"
#include "SynchronousExecutionEngine.h"
#include "ModelEditLanguage.h"
#include "CPUMatrix.h" // used for SetNumThreads()
#include "SGD.h"
#include "MPIWrapper.h"
#include "commandArgUtil.h"
#include "MultiNetworksSGD.h"
#include "SimpleEvaluator.h"
@ -165,7 +169,7 @@ void DoEvalUnroll(const ConfigParameters& config)
net.ResetEvalTimeStamp();
SimpleEvaluator<ElemType> eval(net);
ElemType evalEntropy;
double evalEntropy;
eval.EvaluateUnroll(&testDataReader, mbSize[0], evalEntropy, path2EvalResults == L"" ? nullptr : path2EvalResults.c_str(), epochSize);
}
@ -201,7 +205,7 @@ void DoCrossValidate(const ConfigParameters& config)
evalNodeNamesVector.push_back(evalNodeNames[i]);
}
std::vector<std::vector<ElemType>> cvErrorResults;
std::vector<std::vector<double>> cvErrorResults;
std::vector<std::wstring> cvModels;
DataReader<ElemType> cvDataReader(readerConfig);
@ -231,8 +235,7 @@ void DoCrossValidate(const ConfigParameters& config)
SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel);
fprintf(stderr, "model %ls --> \n", cvModelPath.c_str());
std::vector<ElemType> evalErrors;
evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
cvErrorResults.push_back(evalErrors);
::Sleep(1000 * sleepSecondsBetweenRuns);
@ -242,9 +245,9 @@ void DoCrossValidate(const ConfigParameters& config)
if (cvErrorResults.size() == 0)
throw std::logic_error("No model is evaluated.");
std::vector<ElemType> minErrors;
std::vector<double> minErrors;
std::vector<int> minErrIds;
std::vector<ElemType> evalErrors = cvErrorResults[0];
std::vector<double> evalErrors = cvErrorResults[0];
for (int i = 0; i < evalErrors.size(); ++i)
{
minErrors.push_back(evalErrors[i]);
@ -474,7 +477,7 @@ void SVDConfigFileUsage()
}
template<typename ElemType>
template<class ElemType>
void DoParameterSVD(const ConfigParameters& config)
{
DEVICEID_TYPE deviceID = -1; // use CPU for SVD
@ -730,13 +733,11 @@ void DoTrain(const ConfigParameters& config)
if (config.Exists("NDLNetworkBuilder"))
{
ConfigParameters ndlNetworkBuilderConfig(config("NDLNetworkBuilder"));
//netBuilder = unique_ptr<IComputationNetBuilder<ElemType>>(static_cast<IComputationNetBuilder<ElemType>*>(new NDLBuilder<ElemType>(config)));
netBuilder = unique_ptr<IComputationNetBuilder<ElemType>>(new NDLBuilder<ElemType>(ndlNetworkBuilderConfig));
}
else if (config.Exists("SimpleNetworkBuilder"))
{
ConfigParameters simpleNetworkBuilderConfig(config("SimpleNetworkBuilder"));
//netBuilder = unique_ptr<IComputationNetBuilder<ElemType>>(static_cast<IComputationNetBuilder<ElemType>*>(new SimpleNetworkBuilder<ElemType>(config)));
netBuilder = unique_ptr<IComputationNetBuilder<ElemType>>(new SimpleNetworkBuilder<ElemType>(simpleNetworkBuilderConfig));
}
else if (config.Exists("ExperimentalNetworkBuilder")) // for testing/early access to NDL extensions

Просмотреть файл

@ -49,14 +49,14 @@
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<IncludePath>..\CNTKSGDLib;..\CNTKComputationNetworkLib;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(SolutionDir)$(Platform)\$(Configuration);$(SolutionDir)..\Common\lib;$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\$(Platform)</LibraryPath>
<CustomBuildAfterTargets>Build</CustomBuildAfterTargets>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<IncludePath>..\CNTKSGDLib;..\CNTKComputationNetworkLib;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(SolutionDir)$(Platform)\$(Configuration);$(SolutionDir)..\Common\lib;$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\$(Platform)</LibraryPath>
<CustomBuildAfterTargets>Build</CustomBuildAfterTargets>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
@ -78,9 +78,9 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKSGDLib.lib; CNTKComputationNetworkLib.lib; CNTKMathDll.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<DelayLoadDLLs>CNTKMathDll.dll; msmpi.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PostBuildEvent>
@ -120,9 +120,9 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKMath.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKSGDLib.lib; CNTKComputationNetworkLib.lib; CNTKMathDll.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<DelayLoadDLLs>CNTKMathDll.dll; msmpi.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<AdditionalLibraryDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
@ -170,34 +170,34 @@
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\hostname.h" />
<ClInclude Include="..\..\Common\Include\minibatchsourcehelpers.h" />
<ClInclude Include="..\..\Common\Include\nvml.h" />
<ClInclude Include="..\..\Common\Include\Platform.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="CompositeComputationNodes.h" />
<ClInclude Include="..\..\Math\Math\CPUMatrix.h" />
<ClInclude Include="..\..\Math\Math\Matrix.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\CompositeComputationNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\MatrixPool.h" />
<ClInclude Include="..\CNTKSGDLib\IComputationNetBuilder.h" />
<ClInclude Include="AllReduceDistGradAggregator.h" />
<ClInclude Include="ComputationNetwork.h" />
<ClInclude Include="ComputationNetworkBuilder.h" />
<ClInclude Include="ComputationNetworkHelper.h" />
<ClInclude Include="ComputationNode.h" />
<ClInclude Include="ConvolutionalNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNetwork.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNetworkBuilder.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNode.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ConvolutionalNodes.h" />
<ClInclude Include="DistGradHeader.h" />
<ClInclude Include="IDistGradAggregator.h" />
<ClInclude Include="MPIWrapper.h" />
<ClInclude Include="DecoderNode.h" />
<ClInclude Include="EvaluationCriterionNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\DecoderNode.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\EvaluationCriterionNodes.h" />
<ClInclude Include="ExperimentalNetworkBuilder.h" />
<ClInclude Include="IComputationNetBuilder.h" />
<ClInclude Include="IExecutionEngine.h" />
<ClInclude Include="InputAndParamNodes.h" />
<ClInclude Include="LinearAlgebraNodes.h" />
<ClInclude Include="MatrixPool.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\InputAndParamNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\LinearAlgebraNodes.h" />
<ClInclude Include="ModelEditLanguage.h" />
<ClInclude Include="MultiNetworksSGD.h" />
<ClInclude Include="NDLNetworkBuilder.h" />
<ClInclude Include="NDLUtil.h" />
<ClInclude Include="NetworkDescriptionLanguage.h" />
<ClInclude Include="NonlinearityNodes.h" />
<ClInclude Include="RecurrentNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\NonlinearityNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\RecurrentNodes.h" />
<ClInclude Include="SimpleEvaluator.h" />
<ClInclude Include="SimpleOutputWriter.h" />
<ClInclude Include="SGD.h" />
@ -205,13 +205,12 @@
<ClInclude Include="stdafx.h" />
<ClInclude Include="SynchronousExecutionEngine.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="TrainingCriterionNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\TrainingCriterionNodes.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="..\..\BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="..\..\BrainScript\BrainScriptTest.cpp" />
<ClCompile Include="..\..\Common\BestGpu.cpp" />
<ClCompile Include="..\..\Common\ConfigFile.cpp" />
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\DataWriter.cpp" />
@ -223,14 +222,10 @@
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="ComputationNetwork.cpp" />
<ClCompile Include="ComputationNetworkBuilder.cpp" />
<ClCompile Include="ComputationNode.cpp" />
<ClCompile Include="ExperimentalNetworkBuilder.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />
<ClCompile Include="NetworkDescriptionLanguage.cpp" />
<ClCompile Include="SimpleNetworkBuilder.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="SynchronousExecutionEngine.cpp" />
<ClCompile Include="tests.cpp" />

Просмотреть файл

@ -19,51 +19,36 @@
<ClCompile Include="ModelEditLanguage.cpp">
<Filter>Model Editing</Filter>
</ClCompile>
<ClCompile Include="ComputationNode.cpp">
<Filter>Nodes</Filter>
</ClCompile>
<ClCompile Include="SimpleNetworkBuilder.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="tests.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="NetworkDescriptionLanguage.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="..\..\Common\BestGpu.cpp">
<Filter>GPU Interfacing</Filter>
</ClCompile>
<ClCompile Include="ExperimentalNetworkBuilder.cpp">
<Filter>Experimental</Filter>
</ClCompile>
<ClCompile Include="Profiler.cpp">
<Filter>GPU Interfacing</Filter>
</ClCompile>
<ClCompile Include="..\..\BrainScript\BrainScriptEvaluator.cpp">
<Filter>Experimental</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClCompile>
<ClCompile Include="..\..\BrainScript\BrainScriptParser.cpp">
<Filter>Experimental</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClCompile>
<ClCompile Include="..\..\BrainScript\BrainScriptTest.cpp">
<Filter>Experimental</Filter>
</ClCompile>
<ClCompile Include="ComputationNetworkBuilder.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="ComputationNetwork.cpp">
<Filter>Network</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClCompile>
<ClCompile Include="SynchronousExecutionEngine.cpp">
<Filter>Evaluation</Filter>
<Filter>Model Building, from old NDL</Filter>
</ClCompile>
<ClCompile Include="ExperimentalNetworkBuilder.cpp">
<Filter>Model Building, experimental extensions</Filter>
</ClCompile>
<ClCompile Include="NetworkDescriptionLanguage.cpp">
<Filter>Model Building, from old NDL</Filter>
</ClCompile>
<ClCompile Include="SimpleNetworkBuilder.cpp">
<Filter>Model Building, Standard Models</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
@ -85,47 +70,20 @@
<ClInclude Include="..\..\Common\Include\DataWriter.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="ComputationNetwork.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="ComputationNetworkHelper.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="IComputationNetBuilder.h">
<Filter>Network</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNetwork.h">
<Filter>from CNTKComputationNetworkLib\Network</Filter>
</ClInclude>
<ClInclude Include="IExecutionEngine.h">
<Filter>Evaluation</Filter>
<Filter>Model Building, from old NDL</Filter>
</ClInclude>
<ClInclude Include="ModelEditLanguage.h">
<Filter>Model Editing</Filter>
</ClInclude>
<ClInclude Include="ComputationNode.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="NDLNetworkBuilder.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="NDLUtil.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="NetworkDescriptionLanguage.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="SimpleEvaluator.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="SimpleNetworkBuilder.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="SimpleOutputWriter.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="SGD.h">
<Filter>Network</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNode.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="SynchronousExecutionEngine.h">
<Filter>Evaluation</Filter>
<Filter>Model Building, from old NDL</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
@ -142,80 +100,107 @@
<ClInclude Include="..\..\Common\Include\Basics.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\nvml.h">
<Filter>GPU Interfacing</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\minibatchsourcehelpers.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\BestGpu.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="CompositeComputationNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\CompositeComputationNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="EvaluationCriterionNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\EvaluationCriterionNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="TrainingCriterionNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\TrainingCriterionNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="NonlinearityNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\NonlinearityNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="LinearAlgebraNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\LinearAlgebraNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="ConvolutionalNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\ConvolutionalNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="RecurrentNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\RecurrentNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="InputAndParamNodes.h">
<Filter>Nodes</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\InputAndParamNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="DecoderNode.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="MultiNetworksSGD.h">
<Filter>Network</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\DecoderNode.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\CrossProcessMutex.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="ExperimentalNetworkBuilder.h">
<Filter>Experimental</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClInclude>
<ClInclude Include="AllReduceDistGradAggregator.h">
<Filter>Parallelization</Filter>
<Filter>from CNTKSGDLib\SGD Parallelization</Filter>
</ClInclude>
<ClInclude Include="DistGradHeader.h">
<Filter>Parallelization</Filter>
<Filter>from CNTKSGDLib\SGD Parallelization</Filter>
</ClInclude>
<ClInclude Include="IDistGradAggregator.h">
<Filter>Parallelization</Filter>
<Filter>from CNTKSGDLib\SGD Parallelization</Filter>
</ClInclude>
<ClInclude Include="MPIWrapper.h">
<Filter>Parallelization</Filter>
<Filter>from CNTKSGDLib\SGD Parallelization</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\Platform.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="MatrixPool.h">
<Filter>Evaluation</Filter>
</ClInclude>
<ClInclude Include="..\..\BrainScript\BrainScriptEvaluator.h">
<Filter>Experimental</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClInclude>
<ClInclude Include="..\..\BrainScript\BrainScriptObjects.h">
<Filter>Experimental</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClInclude>
<ClInclude Include="..\..\BrainScript\BrainScriptParser.h">
<Filter>Experimental</Filter>
<Filter>Model Building, experimental extensions</Filter>
</ClInclude>
<ClInclude Include="ComputationNetworkBuilder.h">
<Filter>Network</Filter>
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNetworkBuilder.h">
<Filter>from CNTKComputationNetworkLib\Network</Filter>
</ClInclude>
<ClInclude Include="NDLNetworkBuilder.h">
<Filter>Model Building, from old NDL</Filter>
</ClInclude>
<ClInclude Include="NDLUtil.h">
<Filter>Model Building, from old NDL</Filter>
</ClInclude>
<ClInclude Include="NetworkDescriptionLanguage.h">
<Filter>Model Building, from old NDL</Filter>
</ClInclude>
<ClInclude Include="SGD.h">
<Filter>from CNTKSGDLib\SGD</Filter>
</ClInclude>
<ClInclude Include="MultiNetworksSGD.h">
<Filter>from CNTKSGDLib\SGD</Filter>
</ClInclude>
<ClInclude Include="SimpleEvaluator.h">
<Filter>from CNTKSGDLib\SGD</Filter>
</ClInclude>
<ClInclude Include="SimpleOutputWriter.h">
<Filter>from CNTKSGDLib\SGD</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\MatrixPool.h">
<Filter>from CNTKComputationNetworkLib\Network</Filter>
</ClInclude>
<ClInclude Include="..\CNTKSGDLib\IComputationNetBuilder.h">
<Filter>from CNTKSGDLib\SGD</Filter>
</ClInclude>
<ClInclude Include="SimpleNetworkBuilder.h">
<Filter>Model Building, Standard Models</Filter>
</ClInclude>
<ClInclude Include="..\..\Math\Math\Matrix.h">
<Filter>from CNTKMath</Filter>
</ClInclude>
<ClInclude Include="..\..\Math\Math\CPUMatrix.h">
<Filter>from CNTKMath</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
@ -229,7 +214,7 @@
<Filter>Misc</Filter>
</Text>
<Text Include="..\..\BrainScript\Notes.txt">
<Filter>Experimental\Doc</Filter>
<Filter>Model Building, experimental extensions\Doc</Filter>
</Text>
</ItemGroup>
<ItemGroup>
@ -239,40 +224,52 @@
<Filter Include="Common\Include">
<UniqueIdentifier>{85226dda-87ba-4da6-af04-563d0ce23b94}</UniqueIdentifier>
</Filter>
<Filter Include="Network">
<UniqueIdentifier>{498bb2e9-53de-4955-970e-813e3f21025b}</UniqueIdentifier>
</Filter>
<Filter Include="Model Editing">
<UniqueIdentifier>{53c3735f-1374-4044-ab58-8a646c95a5e8}</UniqueIdentifier>
</Filter>
<Filter Include="Nodes">
<UniqueIdentifier>{0b366814-48b2-4619-bf92-85ee24e3cbc1}</UniqueIdentifier>
</Filter>
<Filter Include="Misc">
<UniqueIdentifier>{3c119a92-ffb2-4850-adae-01778324974d}</UniqueIdentifier>
</Filter>
<Filter Include="GPU Interfacing">
<UniqueIdentifier>{8d99b2cc-5209-40e4-8b4b-a7616973ae3b}</UniqueIdentifier>
<Filter Include="from CNTKComputationNetworkLib">
<UniqueIdentifier>{7b4cb3e8-272f-413d-badd-d437779b1aeb}</UniqueIdentifier>
</Filter>
<Filter Include="Experimental">
<UniqueIdentifier>{fe2443a1-6323-449f-96be-cbd0f608f382}</UniqueIdentifier>
<Filter Include="from CNTKComputationNetworkLib\Nodes">
<UniqueIdentifier>{0b366814-48b2-4619-bf92-85ee24e3cbc1}</UniqueIdentifier>
</Filter>
<Filter Include="Parallelization">
<Filter Include="from CNTKComputationNetworkLib\Network">
<UniqueIdentifier>{498bb2e9-53de-4955-970e-813e3f21025b}</UniqueIdentifier>
</Filter>
<Filter Include="from CNTKSGDLib">
<UniqueIdentifier>{d3d5900a-8c5e-45f1-a2b7-f82f0e31994d}</UniqueIdentifier>
</Filter>
<Filter Include="from CNTKSGDLib\SGD Parallelization">
<UniqueIdentifier>{8531d7fb-a673-491a-988a-012c92fafbfd}</UniqueIdentifier>
</Filter>
<Filter Include="Evaluation">
<Filter Include="from CNTKSGDLib\SGD">
<UniqueIdentifier>{4f06ac18-7b30-490c-b801-128bdaa99450}</UniqueIdentifier>
</Filter>
<Filter Include="Model Building, from old NDL">
<UniqueIdentifier>{3ddfc109-3a90-45f5-91e8-1930759cfe9d}</UniqueIdentifier>
</Filter>
<Filter Include="Experimental\Doc">
<Filter Include="Model Building, Standard Models">
<UniqueIdentifier>{f474b73c-05f9-43e6-997f-3ec83805c655}</UniqueIdentifier>
</Filter>
<Filter Include="Model Building, experimental extensions">
<UniqueIdentifier>{fe2443a1-6323-449f-96be-cbd0f608f382}</UniqueIdentifier>
</Filter>
<Filter Include="Model Building, experimental extensions\Doc">
<UniqueIdentifier>{23e7cd74-fd60-4fb4-a925-c3dea584f176}</UniqueIdentifier>
</Filter>
<Filter Include="from CNTKMath">
<UniqueIdentifier>{ebc74fe7-4a25-46e7-87a8-121881ef9124}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="prebuild.bat">
<Filter>Misc</Filter>
</None>
<None Include="..\..\BrainScript\BrainScript--extending the CNTK config language, Frank Seide August 2015.pptx">
<Filter>Experimental\Doc</Filter>
<Filter>Model Building, experimental extensions\Doc</Filter>
</None>
</ItemGroup>
</Project>

Просмотреть файл

@ -1,559 +0,0 @@
// ComputationNetworkBuilder -- helper class for constructing ComputationNetworks and ComputationNodes from C++ (internal and external)
//
// <copyright file="ComputationNode.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "Basics.h"
#include "ComputationNetworkBuilder.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "LinearAlgebraNodes.h"
#include "NonlinearityNodes.h"
#include "ConvolutionalNodes.h"
#include "RecurrentNodes.h"
#include "DecoderNode.h"
#include "TrainingCriterionNodes.h"
#include "CompositeComputationNodes.h"
#include "EvaluationCriterionNodes.h"
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
// create a new node of a type given as a string, with var args so that this can be used at multiple places
// This function only creates nodes that accept (m_deviceId, nodeName).
// It maps a node-type name (as produced by each node class's static TypeName()) to a freshly
// constructed, empty node instance. Returns nullptr for unrecognized names so that the caller
// (NewNode) can fall back to node types that require additional constructor arguments.
template<typename ElemType>
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewStandardNode(const std::wstring & nodeType, DEVICEID_TYPE deviceId, const wstring & name)
{
// please keep this table sorted
if (nodeType == CRFNode<ElemType>::TypeName()) return New<CRFNode<ElemType>>(deviceId, name);
else if (nodeType == ClassBasedCrossEntropyWithSoftmaxNode<ElemType>::TypeName()) return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == ColumnElementTimesNode<ElemType>::TypeName()) return New<ColumnElementTimesNode<ElemType>>(deviceId, name);
else if (nodeType == CosDistanceNode<ElemType>::TypeName()) return New<CosDistanceNode<ElemType>>(deviceId, name);
else if (nodeType == CosDistanceWithNegativeSamplesNode<ElemType>::TypeName()) return New<CosDistanceWithNegativeSamplesNode<ElemType>>(deviceId, name);
else if (nodeType == CosineNode<ElemType>::TypeName()) return New<CosineNode<ElemType>>(deviceId, name);
else if (nodeType == CrossEntropyNode<ElemType>::TypeName()) return New<CrossEntropyNode<ElemType>>(deviceId, name);
else if (nodeType == CrossEntropyWithSoftmaxNode<ElemType>::TypeName()) return New<CrossEntropyWithSoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == DiagTimesNode<ElemType>::TypeName()) return New<DiagTimesNode<ElemType>>(deviceId, name);
else if (nodeType == DropoutNode<ElemType>::TypeName()) return New<DropoutNode<ElemType>>(deviceId, name);
else if (nodeType == DummyCriterionNode<ElemType>::TypeName()) return New<DummyCriterionNode<ElemType>>(deviceId, name);
else if (nodeType == ElementTimesNode<ElemType>::TypeName()) return New<ElementTimesNode<ElemType>>(deviceId, name);
else if (nodeType == ErrorPredictionNode<ElemType>::TypeName()) return New<ErrorPredictionNode<ElemType>>(deviceId, name);
else if (nodeType == ExpNode<ElemType>::TypeName()) return New<ExpNode<ElemType>>(deviceId, name);
else if (nodeType == FutureValueNode<ElemType>::TypeName()) return New<FutureValueNode<ElemType>>(deviceId, name);
else if (nodeType == GMMLogLikelihoodNode<ElemType>::TypeName()) return New<GMMLogLikelihoodNode<ElemType>>(deviceId, name);
else if (nodeType == InvStdDevNode<ElemType>::TypeName()) return New<InvStdDevNode<ElemType>>(deviceId, name);
else if (nodeType == KhatriRaoProductNode<ElemType>::TypeName()) return New<KhatriRaoProductNode<ElemType>>(deviceId, name);
else if (nodeType == LSTMNode<ElemType>::TypeName()) return New<LSTMNode<ElemType>>(deviceId, name);
else if (nodeType == LogNode<ElemType>::TypeName()) return New<LogNode<ElemType>>(deviceId, name);
else if (nodeType == LogSoftmaxNode<ElemType>::TypeName()) return New<LogSoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == LookupTableNode<ElemType>::TypeName()) return New<LookupTableNode<ElemType>>(deviceId, name);
else if (nodeType == MatrixL1RegNode<ElemType>::TypeName()) return New<MatrixL1RegNode<ElemType>>(deviceId, name);
else if (nodeType == MatrixL2RegNode<ElemType>::TypeName()) return New<MatrixL2RegNode<ElemType>>(deviceId, name);
else if (nodeType == MeanNode<ElemType>::TypeName()) return New<MeanNode<ElemType>>(deviceId, name);
else if (nodeType == MinusNode<ElemType>::TypeName()) return New<MinusNode<ElemType>>(deviceId, name);
else if (nodeType == NegateNode<ElemType>::TypeName()) return New<NegateNode<ElemType>>(deviceId, name);
else if (nodeType == NoiseContrastiveEstimationNode<ElemType>::TypeName()) return New<NoiseContrastiveEstimationNode<ElemType>>(deviceId, name);
else if (nodeType == PairNetworkNode<ElemType>::TypeName()) return New<PairNetworkNode<ElemType>>(deviceId, name);
else if (nodeType == ParallelNode<ElemType>::TypeName()) return New<ParallelNode<ElemType>>(deviceId, name);
// L"Delay" is accepted as a legacy alias for PastValueNode (older model files/scripts)
else if (nodeType == PastValueNode<ElemType>::TypeName() || nodeType == L"Delay") return New<PastValueNode<ElemType>>(deviceId, name);
// the string literals below are legacy alternate spellings of these two node types
else if (nodeType == PerDimMeanVarDeNormalizationNode<ElemType>::TypeName() || nodeType == L"PerDimMeanVarDeNormalizationNode") return New<PerDimMeanVarDeNormalizationNode<ElemType>>(deviceId, name);
else if (nodeType == PerDimMeanVarNormalizationNode<ElemType>::TypeName() || nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(deviceId, name);
else if (nodeType == PlusNode<ElemType>::TypeName()) return New<PlusNode<ElemType>>(deviceId, name);
else if (nodeType == RectifiedLinearNode<ElemType>::TypeName()) return New<RectifiedLinearNode<ElemType>>(deviceId, name);
else if (nodeType == ReshapeNode<ElemType>::TypeName()) return New<ReshapeNode<ElemType>>(deviceId, name);
else if (nodeType == RowElementTimesNode<ElemType>::TypeName()) return New<RowElementTimesNode<ElemType>>(deviceId, name);
else if (nodeType == RowRepeatNode<ElemType>::TypeName()) return New<RowRepeatNode<ElemType>>(deviceId, name);
else if (nodeType == RowSliceNode<ElemType>::TypeName()) return New<RowSliceNode<ElemType>>(deviceId, name);
else if (nodeType == RowStackNode<ElemType>::TypeName()) return New<RowStackNode<ElemType>>(deviceId, name);
else if (nodeType == ScaleNode<ElemType>::TypeName()) return New<ScaleNode<ElemType>>(deviceId, name);
else if (nodeType == SequenceDecoderNode<ElemType>::TypeName()) return New<SequenceDecoderNode<ElemType>>(deviceId, name);
else if (nodeType == SigmoidNode<ElemType>::TypeName()) return New<SigmoidNode<ElemType>>(deviceId, name);
else if (nodeType == SoftmaxNode<ElemType>::TypeName()) return New<SoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == SquareErrorNode<ElemType>::TypeName()) return New<SquareErrorNode<ElemType>>(deviceId, name);
else if (nodeType == StrideTimesNode<ElemType>::TypeName()) return New<StrideTimesNode<ElemType>>(deviceId, name);
else if (nodeType == SumColumnElementsNode<ElemType>::TypeName()) return New<SumColumnElementsNode<ElemType>>(deviceId, name);
else if (nodeType == SumElementsNode<ElemType>::TypeName()) return New<SumElementsNode<ElemType>>(deviceId, name);
else if (nodeType == TanhNode<ElemType>::TypeName()) return New<TanhNode<ElemType>>(deviceId, name);
else if (nodeType == TimeReverseNode<ElemType>::TypeName()) return New<TimeReverseNode<ElemType>>(deviceId, name);
else if (nodeType == TimesNode<ElemType>::TypeName()) return New<TimesNode<ElemType>>(deviceId, name);
else if (nodeType == TransposeNode<ElemType>::TypeName()) return New<TransposeNode<ElemType>>(deviceId, name);
else if (nodeType == TransposeTimesNode<ElemType>::TypeName()) return New<TransposeTimesNode<ElemType>>(deviceId, name);
// unknown type name: signal "not handled here" so the caller can try other constructors
else return nullptr;
}
// Create a new node from its type-name string. Used when loading a network from disk,
// whereas NewStandardNode() alone is used when building standard-type networks in code.
// Returns nullptr if the type name is not recognized.
template<typename ElemType>
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewNode(const std::wstring & nodeType, DEVICEID_TYPE deviceId, const wstring & name)
{
    // TODO: Is this ever called with additional _Args? If not, simplify
    // First try the node types that take the standard (deviceId, name) constructor.
    if (auto node = NewStandardNode(nodeType, deviceId, name))
        return node;
    // Fall back to node types whose constructors take extra arguments.
    if (nodeType == AveragePoolingNode<ElemType>::TypeName())          return New<AveragePoolingNode<ElemType>>(deviceId, name);
    if (nodeType == ConvolutionNode<ElemType>::TypeName())             return New<ConvolutionNode<ElemType>>(deviceId, name);
    if (nodeType == InputValue<ElemType>::SparseTypeName())            return New<InputValue<ElemType>>(deviceId, name, true);
    if (nodeType == InputValue<ElemType>::TypeName())                  return New<InputValue<ElemType>>(deviceId, name);
    if (nodeType == LearnableParameter<ElemType>::TypeName())          return New<LearnableParameter<ElemType>>(deviceId, name);
    if (nodeType == MaxPoolingNode<ElemType>::TypeName())              return New<MaxPoolingNode<ElemType>>(deviceId, name);
    if (nodeType == SparseLearnableParameter<ElemType>::TypeName())    return New<SparseLearnableParameter<ElemType>>(deviceId, name);
    return nullptr; // unknown type name
}
// -----------------------------------------------------------------------
// node creation
// -----------------------------------------------------------------------
// The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs).
// There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others.
// TODO: Do we really need these? Folks who want to use C++ can instead say net->AddNodeToNet(New<>(...)), which is not that different.
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols)
{
    // Create a dense learnable-parameter node of dimensions rows x cols on the network's
    // device and register it with the network.
    // TODO: in SimpleNetworkBuilder, this is very often followed by InitLearnableParameter()--we should have an overload that just does it right away
    auto param = New<LearnableParameter<ElemType>>(net.GetDeviceID(), paramName, rows, cols);
    return net.AddNodeToNetWithElemType(param);
}
// Create a sparse learnable-parameter node; the sparse matrix size may optionally be
// specified via 'size' (a default is assumed to apply when it is omitted by the caller —
// TODO confirm against the declaration).
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size)
{
    auto param = New<SparseLearnableParameter<ElemType>>(net.GetDeviceID(), paramName, rows, cols, size);
    return net.AddNodeToNetWithElemType(param);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, rows, cols));
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, rows, cols, true));
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName,
const size_t imageWidth,
const size_t imageHeight,
const size_t imageChannels,
const size_t numImages)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, imageWidth, imageHeight, imageChannels, numImages));
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName,
const size_t imageWidth,
const size_t imageHeight,
const size_t imageChannels,
const size_t numImages)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, imageWidth, imageHeight, imageChannels, numImages, true));
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols)
{
return net.AddNodeToNetWithElemType(New<PairNetworkNode<ElemType>>(net.GetDeviceID(), inputName, rows, cols));
}
// Creates a ConvolutionNode with the given kernel geometry and subsampling (stride) factors.
// maxTempMemSizeInSamples bounds the workspace used for unrolling; inputs are attached later.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring & nodeName,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceID(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples));
}
// Creates a MaxPoolingNode with the given pooling-window geometry and stride; inputs attached later.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring & nodeName,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample)
{
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
}
// Creates an AveragePoolingNode with the given pooling-window geometry and stride; inputs attached later.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth,
const size_t windowHeight, const size_t horizontalSubsample,
const size_t verticalSubsample)
{
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
}
// this is the catch-all for all cases not covered as special cases above
// Unlike the specialized ones above, this one creates nodes by type given as a string.
// Note: NewStandardNode() returns nullptr for unknown type names; that nullptr is passed on
// to AddNodeToNetWithElemType() here -- behavior in that case is determined by the callee.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateComputationNode(const std::wstring & nodeType, const std::wstring & nodeName)
{
return net.AddNodeToNetWithElemType(NewStandardNode(nodeType, net.GetDeviceID(), nodeName));
}
// -----------------------------------------------------------------------
// node creation
// -----------------------------------------------------------------------
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
// Creates a PairNetworkNode that pairs node 'a' of another network into this network.
// Fails if this network already contains a node with the same name, since the pairing
// is resolved by name. The message is printed to stderr first (with the offending name)
// and then thrown without the name -- the duplication appears intentional so the name
// shows up in the log even if the exception text is truncated elsewhere.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PairNetwork(const ComputationNodePtr & a, const std::wstring nodeName)
{
if (net.GetNodeFromName(a->NodeName(), nullptr, false) != nullptr)
{
fprintf(stderr, "PairNetwork: asked to pair a node with name %ls in another network. However, this network has already a node with the same name. Should avoid this case.\n", a->NodeName().c_str());
RuntimeError("PairNetwork: asked to pair a node with name in another network. However, this network has already a node with the same name. Should avoid this case.\n");
}
return net.AddNodeToNetAndAttachInputs(New<PairNetworkNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Creates a ConvolutionNode and attaches its two inputs (kernel weights and input values).
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding,
const std::wstring nodeName,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceID(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples),
weight, inputValues);
}
// Creates a MaxPoolingNode over 'inputValues' with the given window geometry and stride.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MaxPoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
inputValues);
}
// Creates an AveragePoolingNode over 'inputValues' with the given window geometry and stride.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<AveragePoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
inputValues);
}
// -----------------------------------------------------------------------
// Criterion/evaluation node factories. Each creates the node, adds it to the
// network, and attaches the given inputs in the listed order. The node name is
// always the last parameter; each function is a one-line forwarding call.
// -----------------------------------------------------------------------
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ErrorPredictionNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean,
const ComputationNodePtr InvStdDev, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PerDimMeanVarNormalizationNode<ElemType>>(net.GetDeviceID(), nodeName), feature, mean, InvStdDev);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PerDimMeanVarDeNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean,
const ComputationNodePtr InvStdDev, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PerDimMeanVarDeNormalizationNode<ElemType>>(net.GetDeviceID(), nodeName), feature, mean, InvStdDev);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::SquareError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SquareErrorNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::SequenceDecoder(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr pairscore, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SequenceDecoderNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction, pairscore);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CrossEntropyWithSoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction);
}
// 'mode' selects the NCE evaluation behavior; it is forwarded to the node constructor.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction,
const ComputationNodePtr input_weight,
const ComputationNodePtr input_bias, const std::wstring nodeName,
NCEEvalMode mode)
{
return net.AddNodeToNetAndAttachInputs(New<NoiseContrastiveEstimationNode<ElemType>>(net.GetDeviceID(), nodeName, mode), label, prediction, input_weight, input_bias);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ClassCrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction,
const ComputationNodePtr input_weight,
const ComputationNodePtr cls_log_post_prob,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction, input_weight, cls_log_post_prob);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CRF(const ComputationNodePtr label,
const ComputationNodePtr postDepScore,
const ComputationNodePtr transition_score,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CRFNode<ElemType>>(net.GetDeviceID(), nodeName), label, postDepScore, transition_score);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DummyCriterionNode<ElemType>>(net.GetDeviceID(), nodeName), objectives, derivatives, prediction);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LSTM(const ComputationNodePtr obs,
const ComputationNodePtr inputGate,
const ComputationNodePtr forgetGate,
const ComputationNodePtr outputGate,
const ComputationNodePtr memoryCellWgt,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LSTMNode<ElemType>>(net.GetDeviceID(), nodeName), obs, inputGate, forgetGate, outputGate, memoryCellWgt);
}
// -----------------------------------------------------------------------
// Elementwise / unary node factories. Each is a one-line forwarding call that
// creates the node of the corresponding type, registers it with the network,
// and attaches the given input(s).
// -----------------------------------------------------------------------
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CrossEntropyNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MatrixL1Reg(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MatrixL1RegNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MatrixL2Reg(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MatrixL2RegNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Mean(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MeanNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::InvStdDev(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<InvStdDevNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Negate(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<NegateNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RectifiedLinear(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RectifiedLinearNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Sigmoid(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SigmoidNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Tanh(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TanhNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Exp(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ExpNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Log(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LogNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Cos(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CosineNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Softmax(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LogSoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Sum(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SumElementsNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ScaleNode<ElemType>>(net.GetDeviceID(), nodeName), scalar, matrix);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Transpose(const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TransposeNode<ElemType>>(net.GetDeviceID(), nodeName), matrix);
}
// -----------------------------------------------------------------------
// Binary (mostly matrix-product and elementwise) node factories.
// Each creates the node, adds it to the network, and attaches the operands
// in the listed order.
// -----------------------------------------------------------------------
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Times(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::TransposeTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TransposeTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ElementTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowElementTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ColumnElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ColumnElementTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
// StrideTimes is the only ternary factory in this group (third operand 'c').
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::StrideTimes(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<StrideTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b, c);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DiagTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DiagTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CosDistanceNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::KhatriRaoProduct(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<KhatriRaoProductNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PlusNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Minus(const ComputationNodePtr a,
const ComputationNodePtr b,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MinusNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Dropout(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DropoutNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Creates a ReshapeNode that reinterprets 'a' as num_rows rows with the given image geometry.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Reshape(const ComputationNodePtr a,
const size_t num_rows,
const size_t img_width,
const size_t img_height,
const size_t img_channels,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceID(), nodeName, num_rows, img_width, img_height, img_channels), a);
}
// Creates a RowRepeatNode that stacks 'a' vertically num_repeat times.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowRepeatNode<ElemType>>(net.GetDeviceID(), nodeName, num_repeat), a);
}
// Creates a PastValueNode (delay node); initHiddenActivity is the value used before the sequence start.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceID(), nodeName, initHiddenActivity, row_size, col_size), a);
}
// Creates a FutureValueNode, the forward-looking counterpart of PastValue.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceID(), nodeName, initHiddenActivity, row_size, col_size), a);
}
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ParallelNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
// Creates a RowSliceNode selecting num_rows rows of 'a' starting at start_index.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowSlice(const ComputationNodePtr a, const size_t start_index, const size_t num_rows, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowSliceNode<ElemType>>(net.GetDeviceID(), nodeName, start_index, num_rows), a);
}
// Creates a RowStackNode and attaches all of 'pinputs' as its inputs.
// The inputs arrive as typed ComputationNodePtr but AddNodeToNetAndAttachInputs()
// takes the type-erased ComputationNodeBasePtr; the shared_ptr converting
// constructor performs that upcast element-wise during the iterator-range
// construction below (replaces the previous manual index loop).
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowStack(const std::vector<ComputationNodePtr> pinputs, const std::wstring nodeName)
{
vector<ComputationNodeBasePtr> inputs(pinputs.begin(), pinputs.end()); // converts to ComputationNodeBasePtr
return net.AddNodeToNetAndAttachInputs(New<RowStackNode<ElemType>>(net.GetDeviceID(), nodeName), inputs);
}
// Creates a GMMLogLikelihoodNode over (unnormedPrior, mean, logStddev, feature).
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::GMMLogLikelihood(const ComputationNodePtr unnormedPrior,
const ComputationNodePtr mean,
const ComputationNodePtr logStddev,
const ComputationNodePtr feature,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<GMMLogLikelihoodNode<ElemType>>(net.GetDeviceID(), nodeName), unnormedPrior, mean, logStddev, feature);
}
// Creates a TimeReverseNode over 'input'.
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::TimeReverse(const ComputationNodePtr input, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TimeReverseNode<ElemType>>(net.GetDeviceID(), nodeName), input);
}
// Creates a LookupTableNode that looks 'input' up in 'dictionary' (embedding lookup).
template<typename ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LookupTable(const ComputationNodePtr dictionary, const ComputationNodePtr input, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LookupTableNode<ElemType>>(net.GetDeviceID(), nodeName), dictionary, input);
}
template class ComputationNetworkBuilder<float>;
template class ComputationNetworkBuilder<double>;
}}}

Просмотреть файл

@ -1,81 +0,0 @@
//
// <copyright file="ComputationNetworkHelper.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include <vector>
#include <string>
#include <stdexcept>
#include <fstream>
#include "Basics.h"
#include "fileutil.h"
#include "ComputationNetwork.h"
#include "NonlinearityNodes.h" // TODO: move functions that depend on this to a .cpp file
#include "ConvolutionalNodes.h"
#include "DataReader.h"
using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
//utility class used by SGD, outputWriter and Evaluator
// TODO: make independent of ElemType
template<class ElemType>
class ComputationNetworkHelper
{
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
protected:
void UpdateEvalTimeStamps(const std::vector<ComputationNodeBasePtr> & nodes)
{
for (size_t i=0; i<nodes.size(); i++)
nodes[i]->UpdateEvalTimeStamp();
}
void SetDropoutRate(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const ElemType dropoutRate, ElemType & prevDropoutRate, unsigned long & dropOutSeed)
{
if (dropoutRate != prevDropoutRate)
{
fprintf(stderr,"Switching dropout rate to %.8g.\n", dropoutRate);
std::list<ComputationNodeBasePtr> dropoutNodes = net.GetNodesWithType(DropoutNode<ElemType>::TypeName(), criterionNode);
if (dropoutNodes.size() == 0 && dropoutRate > 0)
{
fprintf(stderr,"WARNING: there is no dropout node.\n");
}
else
{
for (auto nodeIter=dropoutNodes.begin(); nodeIter != dropoutNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(*nodeIter);
node->SetDropoutRate(dropoutRate);
node->SetRandomSeed(dropOutSeed++);
}
}
prevDropoutRate = dropoutRate;
}
}
void SetMaxTempMemSizeForCNN(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const size_t maxTempMemSizeInSamples)
{
fprintf(stderr,"Set Max Temp Mem Size For Convolution Nodes to %lu samples.\n", maxTempMemSizeInSamples);
std::list<ComputationNodeBasePtr> convolutionNodes = net.GetNodesWithType(ConvolutionNode<ElemType>::TypeName(), criterionNode);
if (convolutionNodes.size() == 0 && maxTempMemSizeInSamples != 0)
{
fprintf(stderr,"WARNING: there is no convolution node.\n");
}
else
{
for (auto nodeIter=convolutionNodes.begin(); nodeIter != convolutionNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<ConvolutionNode<ElemType>>(*nodeIter);
node->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}
};
}}}

Просмотреть файл

@ -7,26 +7,27 @@
#include "ExperimentalNetworkBuilder.h"
#include "BrainScriptEvaluator.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "RecurrentNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "ConvolutionalNodes.h"
#include "ComputationNetwork.h"
#include "ComputationNetworkBuilder.h"
#include <memory>
#include <deque>
#include <set>
//#include "ComputationNode.h"
//#include "InputAndParamNodes.h"
//#include "RecurrentNodes.h"
//#include "NonlinearityNodes.h"
//#include "LinearAlgebraNodes.h"
//#include "ConvolutionalNodes.h"
//
//#include "ComputationNetwork.h"
//#include "ComputationNetworkBuilder.h"
//
//#include <memory>
//#include <deque>
//#include <set>
#include <string>
#ifndef let
#define let const auto
#endif
namespace Microsoft { namespace MSR { namespace BS {
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace Microsoft::MSR;
@ -140,755 +141,13 @@ namespace Microsoft { namespace MSR { namespace BS {
//BinaryStandardNode(TransposeTimesNode)
;
// The following class(es) implement the MakeRuntimeObject() function for different types. Sorry for the strange template dance.
// -------------------------------------------------------------------
// basic function template, for classes that can instantiate themselves from IConfigRecordPtr TODO: do we even have any?
// -------------------------------------------------------------------
// Primary template: default MakeRuntimeObject() for classes that can construct
// themselves directly from an IConfigRecordPtr. Specializations (e.g. for
// ComputationNode<ElemType>) override this behavior.
template<typename ElemType, class C>
struct DualPrecisionHelpers
{
static shared_ptr<Object> MakeRuntimeObject(const IConfigRecordPtr config) { return make_shared<C>(config); }
};
// -------------------------------------------------------------------
// ComputationNode -- covers all standard nodes
// -------------------------------------------------------------------
// helper wrapper class for ComputationNodes that must AttachInputs() late due to circular references
// Instantiate with LateAttachingNode<node type>(lambda, args for node constructor).
// To resolve, call AttachInputs()
// TODO: This is a bit indirect. Can it be done more nicely?
// Interface through which the network resolves deferred input attachment.
struct ILateAttachingNode { virtual void LateAttachInputs() = 0; };
// Wrapper that defers AttachInputs() of node type N until LateAttachInputs() is
// called, to break circular references during construction. The deferred work is
// captured in the 'attachInputs' lambda passed to the constructor.
template<class N>
class LateAttachingNode : public N, public ILateAttachingNode
{
typedef typename N::OurElemType ElemType;
function<void(ComputationNode<ElemType>*)> attachInputs;
public:
// constructor
template<class... _Types>
LateAttachingNode(DEVICEID_TYPE deviceId, const wstring & name, const function<void(ComputationNode<ElemType>*)> & attachInputs, _Types&&... _Args) : attachInputs(attachInputs), N(deviceId, name, forward<_Types>(_Args)...) {}
// the one member that does the work
// Runs the deferred attachment, then replaces the lambda with one that faults,
// so a second invocation is caught as a logic error rather than silently re-attaching.
void /*ILateAttachingNode::*/LateAttachInputs()
{
attachInputs(dynamic_cast<N*>(this));
attachInputs = [](ComputationNode<ElemType>*){ LogicError("LateAttachingNode::AttachInputs: must only be called once"); };
}
};
template<typename ElemType>
struct DualPrecisionHelpers<ElemType, ComputationNode<ElemType>>
{
// create ComputationNode
// This is the equivalent of the old SynchronousNodeEvaluator::Evaluate(), and we duplicate code from there.
static shared_ptr<Object> MakeRuntimeObject(const IConfigRecordPtr configp)
{
let & config = *configp;
wstring operationName = config[L"operation"];
wstring nodeName = L"<placeholder>"; // name will be overwritten by caller upon return (TODO: fix this here? pass expression name in?)
DEVICEID_TYPE deviceId = (DEVICEID_TYPE)(int)config[L"deviceId"];
static unsigned long m_randomSeedOffset = 0; // TODO: this is held in the ComputationNetwork, but we don't have one yet
// TODO" ^^ actually it seems only used by initialization of LearnableParameters--check that again; in that case, we can have a local
// note on optional parameters
// Instead of defining optional parameters here in code, they are defined as optional args to the creating macro.
ComputationNodeBasePtr node;
#define OpIs(op) (operationName == msra::strfun::utf16(op<ElemType>::TypeName()))
// TODO: in the code below, for reference, each block is preceded by an #if-0'ed out copy of the respective code from SynchronousNodeEvaluator::Evaluate()--remove these when this all works
// first group: nodes without inputs
#if 0
if (InputValue<ElemType>::TypeName() == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// first look for this node already existing in the network
if (m_net.NodeNameExist(name))
nodePtr = m_net.GetNodeFromName(name);
else
nodePtr = m_net.CreateInputNode(name, rows, cols);
}
}
else if (InputValue<ElemType>::SparseTypeName() == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// first look for this node already existing in the network
if (m_net.NodeNameExist(name))
nodePtr = m_net.GetNodeFromName(name);
else
nodePtr = m_net.CreateSparseInputNode(name, rows, cols);
}
}
else if (cnNodeType == L"ImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
nodePtr = m_net.CreateInputNode(name, imageWidth, imageHeight, imageChannels, numImages);
}
}
else if (cnNodeType == L"SparseImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
nodePtr = m_net.CreateSparseInputNode(name, imageWidth, imageHeight, imageChannels, numImages);
}
}
#endif
if (OpIs(InputValue))
{
let isSparse = config(L"isSparse");
let isImage = config(L"isImage");
if (!isImage)
node = New<InputValue<ElemType>>(deviceId, nodeName, (size_t)config[L"rows"], (size_t)config[L"cols"], isSparse);
else
node = New<InputValue<ElemType>>(deviceId, nodeName, (size_t)config[L"imageWidth"], (size_t)config[L"imageHeight"], (size_t)config[L"imageChannels"], (size_t)config[L"numImages"], isSparse);
}
#if 0
else if (LearnableParameter<ElemType>::TypeName() == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "true");
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = needGradient;
}
else if (pass == ndlPassFinal)
{
static int randomSeed = 1;
std::string initString = node->GetOptionalParameter("init", "uniform");
ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1");
ElemType value = node->GetOptionalParameter("value", "0");
msra::strfun::tolower_ascii(initString);
if (initString == "fixedvalue")
nodePtr->FunctionValues().SetValue(value);
else if (initString == "uniform")
m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale);
else if (initString == "gaussian")
m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale);
else if (initString == "fromfile")
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
if (initFromFilePath == "")
RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method");
if (initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size() - 1] == '\"')
// remove the opening and closing double quotes
initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size() - 2);
if (!fexists(initFromFilePath))
RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str());
m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath);
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]");
}
}
else if (SparseLearnableParameter<ElemType>::TypeName() == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "true");
nodePtr = m_net.CreateSparseLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = needGradient;
}
else if (pass == ndlPassFinal)
{
static int randomSeed = 1;
std::string initString = node->GetOptionalParameter("init", "uniform");
ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1");
ElemType value = node->GetOptionalParameter("value", "0");
msra::strfun::tolower_ascii(initString);
if (initString == "fixedvalue")
nodePtr->FunctionValues().SetValue(value);
else if (initString == "uniform")
m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale);
else if (initString == "gaussian")
m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale);
else if (initString == "fromfile")
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
if (initFromFilePath == "")
RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method");
if (initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size() - 1] == '\"')
// remove the opening and closing double quotes
initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size() - 2);
if (!fexists(initFromFilePath))
RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str());
m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath);
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]");
}
}
#endif
else if (OpIs(LearnableParameter) || OpIs(SparseLearnableParameter))
{
// parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
// TODO: do we need a default value mechanism? How to make sure it does not pop upwards? Current functions do not allow overloads.
// TODO: test this with random init for QuickE2E on CPU against SimpleNetworkBuilder
let isSparse = (operationName.find(L"Sparse") != wstring::npos);
if (!isSparse)
node = New<LearnableParameter<ElemType>>(deviceId, nodeName, (size_t)config[L"rows"], (size_t)config[L"cols"]);
else
node = New<SparseLearnableParameter<ElemType>>(deviceId, nodeName, (size_t)config[L"rows"], (size_t)config[L"cols"], 0/*size*/); // TODO: what is size?
node->NeedGradient() = config[L"needGradient"];
static int randomSeed = 1;
wstring initString = config[L"init"];
if (initString == L"fixedValue")
dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->FunctionValues().SetValue((ElemType)config[L"value"]);
else if (initString == L"uniform" || initString == L"gaussian")
{
// TODO: add these options also to old NDL
int forcedRandomSeed = config[L"randomSeed"]; // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->InitRandom((initString == L"uniform"), forcedRandomSeed < 0 ? (randomSeed++ + m_randomSeedOffset) : (unsigned long)forcedRandomSeed, config[L"initValueScale"], config[L"initOnCPUOnly"]);
}
else if (initString == L"fromFile")
{
wstring initFromFilePath = config[L"initFromFilePath"];
if (initFromFilePath.empty())
RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method");
ComputationNetwork::InitLearnableParametersFromFile(dynamic_pointer_cast<ComputationNode<ElemType>>(node), initFromFilePath, node->GetDeviceId());
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedValue|fromFile]");
}
#if 0
else if (cnNodeType == L"Constant")
{
if (parameter.size() != 1)
RuntimeError("Constant should have 1 fixed parameter [val] and two optional parameters [rows=[1|yourvalue], cols=[1|yourvalue]].");
if (pass == ndlPassInitial)
{
size_t rows = node->GetOptionalParameter("rows", "1");
size_t cols = node->GetOptionalParameter("cols", "1");
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = false;
}
else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
nodePtr->FunctionValues().SetValue(val);
}
}
#endif
// Constant is implemented as a LearnableParameter with initializion as fixedValue with needGradient false, on script level
#if 0
else if (cnNodeType == PastValueNode<ElemType>::TypeName() ||
cnNodeType == FutureValueNode<ElemType>::TypeName())
{
if (parameter.size() <2 || parameter.size() >3)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, [cols], m, [timeStep=1, defaultPastValue=0.1]).");
nodeParamCount = 1;
nodeParamStart = parameter.size() > 2 ? 2 : 1;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
// if we have three parameters the second is columns
size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "false");
float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1");
//for backward compatibility we check timeStep first
size_t timeStep = node->GetOptionalParameter("timeStep", "1");
if (timeStep == 1)
{
timeStep = node->GetOptionalParameter("delayTime", "1");
}
if (cnNodeType == PastValueNode<ElemType>::TypeName())
{
nodePtr = m_net.PastValue(NULL, defaultHiddenActivity, rows, cols, name);
static_pointer_cast<PastValueNode<ElemType>>(nodePtr)->SetTimeStep(timeStep);
}
else
{
nodePtr = m_net.FutureValue(NULL, defaultHiddenActivity, rows, cols, name);
static_pointer_cast<FutureValueNode<ElemType>>(nodePtr)->SetTimeStep(timeStep);
}
nodePtr->NeedGradient() = needGradient; // TODO: What for?
}
}
#endif
// nodes with delayed inputs, where we cannot yet resolve inputs due to circular references
else if (OpIs(PastValueNode) || OpIs(FutureValueNode)) // TODO: untested
{
// rows, cols, input, [timeStep=1, defaultHiddenActivation=0.1]
// Note: changed names of optional args compared to current NDL
// TODO: we really should NOT have to specify the dimensions; network builder can figure it out. Keep it for now, fix when it is time.
// We instantiate not the node directly, but a wrapped version that can cast to LateAttachingNode, which holds a lambda to complete the attachment process at the appropriate time.
function<void(ComputationNode<ElemType>*)> completeAttachInputs = [configp](ComputationNode<ElemType>* node) // This is the lambda to complete the process. Note that config captured as a shared_ptr.
{
node->AttachInputs(GetInputs(*configp)); // this is executed by network builder while iterating the nodes
};
if (OpIs(PastValueNode))
node = New<LateAttachingNode<PastValueNode<ElemType>>>(deviceId, nodeName, completeAttachInputs, (ElemType)config[L"defaultHiddenActivation"], (size_t)config[L"rows"], (size_t)config[L"cols"], (size_t)config[L"timeStep"]);
else
node = New<LateAttachingNode<FutureValueNode<ElemType>>>(deviceId, nodeName, completeAttachInputs, (ElemType)config[L"defaultHiddenActivation"], (size_t)config[L"rows"], (size_t)config[L"cols"], (size_t)config[L"timeStep"]);
}
else // nodes with inputs
{
let inputs = GetInputs(config);
// second group: nodes with special initializers
#if 0
/*else*/ if (cnNodeType == RowSliceNode<ElemType>::TypeName())
{
if (parameter.size() != 3)
RuntimeError("RowSlice should have three parameters. Usage: RowSlice(startRowIndex, numRows, origNodeName.");
nodeParamCount = 1;
nodeParamStart = 2;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t start_index = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t num_rows = ((NDLNode<ElemType>*)params[1])->GetScalar();
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = m_net.RowSlice(NULL, start_index, num_rows, name);
nodePtr->NeedGradient() = needGradient;
}
}
#endif
if (OpIs(RowSliceNode)) // TODO: untested
{
// startIndex, numRows, inputs /*one*/, needGradient=false
node = New<RowSliceNode<ElemType>>(deviceId, nodeName, (size_t)config[L"startIndex"], (size_t)config[L"numRows"]);
node->NeedGradient() = config[L"needGradient"];
}
#if 0
else if (cnNodeType == RowRepeatNode<ElemType>::TypeName())
{
if (parameter.size() != 2)
RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats.");
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t num_repeat = ((NDLNode<ElemType>*)params[1])->GetScalar();
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = m_net.RowRepeat(NULL, num_repeat, name);
nodePtr->NeedGradient() = needGradient;
}
}
#endif
else if (OpIs(RowRepeatNode)) // TODO: untested
{
// inputs /*one*/, numRepeats, needGradient=false
node = New<RowRepeatNode<ElemType>>(deviceId, nodeName, (size_t)config[L"numRepeats"]);
node->NeedGradient() = config[L"needGradient"];
}
#if 0
else if (cnNodeType == ReshapeNode<ElemType>::TypeName())
{
if (parameter.size() < 2 || parameter.size() > 5)
RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=].");
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t num_rows = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t img_width = node->GetOptionalParameter("imageWidth", "0");
size_t img_height = node->GetOptionalParameter("imageHeight", "0");
size_t img_channels = node->GetOptionalParameter("imageChannels", "0");
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = m_net.Reshape(NULL, num_rows, img_width, img_height, img_channels, name);
nodePtr->NeedGradient() = needGradient;
}
}
#endif
else if (OpIs(ReshapeNode)) // TODO: untested
{
// inputs /*one*/, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0
node = New<ReshapeNode<ElemType>>(deviceId, nodeName, (size_t)config[L"numRows"], (size_t)config[L"imageWidth"], (size_t)config[L"imageHeight"], (size_t)config[L"imageChannels"]);
node->NeedGradient() = config[L"needGradient"];
//nodePtr = m_net.Reshape(NULL, num_rows, img_width, img_height, img_channels, name);
// BUGBUG: ^^ how to implement this?? We got no network here. What is this for?
LogicError("ReshapeNode not working with BS because init code needs access to network which we don't haveyet--to be fixed elsewhere");
}
#if 0
else if (cnNodeType == ConvolutionNode<ElemType>::TypeName())
{
if (parameter.size() != 7)
RuntimeError("%ls should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 2;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
int id = 2; // skip weightNode and inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t kernelWidth = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t kernelHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t outputChannels = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 5);
//optional
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
nodePtr = m_net.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples);
}
}
#endif
else if (OpIs(ConvolutionNode)) // TODO: untested
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0
node = New<ConvolutionNode<ElemType>>(deviceId, nodeName, (size_t)config[L"kernelWidth"], (size_t)config[L"kernelHeight"], (size_t)config[L"outputChannels"],
(size_t)config[L"horizontalSubsample"], (size_t)config[L"verticalSubsample"],
(bool)config[L"zeroPadding"], (size_t)config[L"maxTempMemSizeInSamples"]);
}
#if 0
else if (cnNodeType == MaxPoolingNode<ElemType>::TypeName())
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
int id = 1; // skip inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t windowWidth = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);
nodePtr = m_net.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
}
}
#endif
else if (OpIs(MaxPoolingNode)) // TODO: untested
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
node = New<MaxPoolingNode<ElemType>>(deviceId, nodeName, (size_t)config[L"windowWidth"], (size_t)config[L"windowHeight"], (size_t)config[L"horizontalSubsample"], (size_t)config[L"verticalSubsample"]);
}
#if 0
else if (cnNodeType == AveragePoolingNode<ElemType>::TypeName())
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
int id = 1; // skip inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t windowWidth = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);
nodePtr = m_net.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
}
}
#endif
else if (OpIs(AveragePoolingNode)) // TODO: untested
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
node = New<AveragePoolingNode<ElemType>>(deviceId, nodeName, (size_t)config[L"windowWidth"], (size_t)config[L"windowHeight"], (size_t)config[L"horizontalSubsample"], (size_t)config[L"verticalSubsample"]);
}
// last group: standard nodes that only take 'inputs'
else
{
node = ComputationNetworkBuilder<ElemType>::NewStandardNode(operationName, deviceId, nodeName);
}
node->AttachInputs(inputs); // TODO: where to check the number of inputs? Should be a template parameter to ComputationNode!
}
// add a tag
let nodeWithTag = dynamic_pointer_cast<WithTag>(node);
if (nodeWithTag)
nodeWithTag->SetTag(config[L"tag"]);
// and done
return node;
}
private:
// helper for the factory function for ComputationNodes
// Reads the 'inputs' argument out of 'config' and returns it as a flat vector of nodes.
// 'inputs' may be given either as a single node or as an array of nodes; both forms are accepted.
static vector<ComputationNodeBasePtr> GetInputs(const IConfigRecord & config)
{
    vector<ComputationNodeBasePtr> inputNodes;
    let inputsArg = config[L"inputs"];
    if (inputsArg.Is<ComputationNodeBase>())        // single-node form
        inputNodes.push_back(inputsArg);
    else                                            // otherwise it must be an array of nodes
    {
        ConfigArrayPtr inputsArray = (ConfigArrayPtr&)inputsArg;
        let indexRange = inputsArray->GetIndexRange();
        for (int index = indexRange.first; index <= indexRange.second; index++)
            inputNodes.push_back(inputsArray->At(index, inputsArg.GetLocation())); // At() resolves each element if it has not been resolved yet
    }
    return inputNodes;
}
};
// -------------------------------------------------------------------
// ComputationNetwork
// -------------------------------------------------------------------
// initialize a ComputationNetwork from a ConfigRecord
// Collects all root ComputationNodes exposed by the config record, then expands that set
// to all reachable child nodes via work-list processing, registering every node with the
// network and sorting tagged nodes into the network's node groups (features, labels, ...).
template<>
/*static*/ shared_ptr<Object> MakeRuntimeObject<ComputationNetwork>(const IConfigRecordPtr configp)
{
    let & config = *configp;

    DEVICEID_TYPE deviceId = (DEVICEID_TYPE)(int)config[L"deviceId"];
    auto net = make_shared<ComputationNetwork>(deviceId);

    auto & nameToNodeMap = net->GetNameToNodeMap(); // (local reference; renamed from member-style 'm_nameToNodeMap')

    deque<ComputationNodeBasePtr> workList;
    // flatten the set of all nodes
    // we collect all root ComputationNodes from the config record, and then expand into all their children by work-list processing
    // TODO: This currently only collects nodes of the same ElemType. We could allow conversion operators.
    // TODO: Can we even make the ComputationNetwork independent of ElemType?? As long as the nodes themselves are hooked up properly that should be OK!
    for (let & id : config.GetMemberIds())
    {
        let & value = config[id];
        if (value.Is<ComputationNodeBase>())
            workList.push_back((ComputationNodeBasePtr&)value);
    }
    // process work list
    // Also call FinalizeInit where we must.
    while (!workList.empty())
    {
        let node = workList.front();
        workList.pop_front();

        // add to the network's name->node map
        let res = nameToNodeMap.insert(make_pair(node->NodeName(), node));
        if (!res.second)            // not inserted: we already got this one
        {                           // (braces added for clarity; the 'else' below belongs to the inner 'if')
            if (res.first->second == node)
                continue;           // the same node reached a second time: nothing more to do
            else                    // oops, a different node with the same name
                LogicError("ComputationNetwork: multiple nodes with the same NodeName() '%ls'", node->NodeName().c_str());
        }

        // If node derives from MustFinalizeInit() then it has unresolved inputs. Resolve them now.
        // This may generate a whole new load of nodes, including nodes which in turn have late init.
        // TODO: think this through whether it may generate circular references nevertheless
        let lateAttachingNode = dynamic_pointer_cast<ILateAttachingNode>(node);
        if (lateAttachingNode)
            lateAttachingNode->LateAttachInputs();

        // add it to the respective node group based on the tag
        let nodeWithTag = dynamic_pointer_cast<WithTag>(node);
        if (nodeWithTag)
        {
            wstring tag = nodeWithTag->GetTag();
            if (tag == L"feature")                              net->FeatureNodes().push_back(node);
            else if (tag == L"label")                           net->LabelNodes().push_back(node);
            else if (tag == L"criterion" || tag == L"criteria") net->FinalCriterionNodes().push_back(node); // 'criteria' is wrong (plural); we keep it for compat
            else if (!_wcsnicmp(tag.c_str(), L"eval", 4))       net->EvaluationNodes().push_back(node);     // any tag starting with "eval"
            else if (tag == L"output")                          net->OutputNodes().push_back(node);
            else if (tag == L"pair")                            net->PairNodes().push_back(node);           // TODO: I made this up; the original code in SynchronousExecutionEngine did not have this
            else if (tag == L"multiseq")                        net->NodesReqMultiSeqHandling().push_back(node);
            else if (!tag.empty())
                RuntimeError("ComputationNetwork: unknown tag '%ls'", tag.c_str());
            // TODO: are there nodes without tag? Where do they go?
        }

        // TODO: ...can we do stuff like propagating dimensions here? Or still too early?

        // traverse children: append them to the end of the work list
        // (iterate by const reference: copying each child shared_ptr would incur an atomic refcount bump per element)
        let children = node->GetChildren();
        for (const auto & child : children)
            workList.push_back(child); // (we could check whether c is in 'nodes' already here to optimize, but this way it is cleaner)
    }
    // TODO: what is missing is the dimensions
#if 1
    wstring args = net->ToString();
    fprintf(stderr, "%ls\n", args.c_str());
#endif
    // these post-processing steps are done by the other network builders, but I don't know why they are necessary
    net->FixupInputMinibatchSize(); // make sure dimensions are set up correctly
    net->ResetEvalTimeStamp();      // (should not really be needed)
    return net;
}
// creates the lambda for creating an object that can exist as 'float' or 'double'
// Pass both types as the two template args.
template<class Cfloat, class Cdouble>
static ConfigurableRuntimeType MakeRuntimeTypeConstructorDualPrecision()
{
    ConfigurableRuntimeType info;
    // The constructor lambda reads the 'precision' config parameter and dispatches to
    // the <double> or <float> instantiation accordingly.
    info.construct = [](const IConfigRecordPtr config)
    {
        wstring precision = (*config)[L"precision"]; // dispatch on ElemType
        if (precision == L"double")
            return DualPrecisionHelpers<double, Cdouble>::MakeRuntimeObject(config);
        else if (precision == L"float")
            return DualPrecisionHelpers<float, Cfloat>::MakeRuntimeObject(config);
        else
            RuntimeError("invalid value for 'precision', must be 'float' or 'double'");
    };
    // we assume that both the float and the double variant behave the same in this regard
    static_assert(is_base_of<IConfigRecord, Cfloat>::value == is_base_of<IConfigRecord, Cdouble>::value, "");
    info.isConfigRecord = is_base_of<IConfigRecord, Cfloat>::value;
    return info;
}
// and the regular one without ElemType dependency
template<class C>
static ConfigurableRuntimeType MakeRuntimeTypeConstructor()
{
    ConfigurableRuntimeType info;
    // single-precision-agnostic case: just forward to the type's MakeRuntimeObject factory
    info.construct = [](const IConfigRecordPtr config) { return MakeRuntimeObject<C>(config); };
    info.isConfigRecord = is_base_of<IConfigRecord, C>::value;
    return info;
}
// helper macros to build one entry of the runtime-type lookup table below:
// each maps the type's name (as a wide-string literal derived from the token T) to its constructor lambda
#define DefineRuntimeType(T) { L ## #T, MakeRuntimeTypeConstructor<T>() }
// dual-precision variant: registers a single name that constructs either T<float> or T<double>,
// chosen at runtime from the 'precision' config parameter
#define DefineRuntimeTypeDualPrecision(T) { L ## #T, MakeRuntimeTypeConstructorDualPrecision<T<float>,T<double>>() }
// get information about configurable runtime types
// This returns a ConfigurableRuntimeType structure which primarily contains a lambda to construct a runtime object from a ConfigRecord ('new' expression).
const ConfigurableRuntimeType * FindExternalRuntimeTypeInfo(const wstring & typeId)
{
    // lookup table for "new" expression
    // This table lists all C++ types that can be instantiated from "new" expressions, and gives a constructor lambda and type flags.
    static map<wstring, ConfigurableRuntimeType> configurableRuntimeTypes =
    {
        // ComputationNodes
        DefineRuntimeTypeDualPrecision(ComputationNode),
        DefineRuntimeType(ComputationNetwork),
#if 0
        DefineRuntimeType(RecurrentComputationNode),
        // In this experimental state, we only have Node and Network.
        // Once BrainScript becomes the driver of everything, we will add other objects like Readers, Optimizers, and Actions here.
#endif
    };
    // look the type up in our own table; absence is not an error here (the caller may consult other tables)
    let iter = configurableRuntimeTypes.find(typeId);
    return (iter != configurableRuntimeTypes.end()) ? &iter->second : nullptr;
}
}}}
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace Microsoft::MSR;
// helper that returns 'float' or 'double' depending on ElemType
template<typename ElemType> static const wchar_t * ElemTypeName();
template<class ElemType> static const wchar_t * ElemTypeName();
template<> /*static*/ const wchar_t * ElemTypeName<float>() { return L"float"; }
template<> /*static*/ const wchar_t * ElemTypeName<double>() { return L"double"; }
// build a ComputationNetwork from BrainScript source code
template<typename ElemType>
template<class ElemType>
/*virtual*/ /*IComputationNetBuilder::*/ComputationNetwork* ExperimentalNetworkBuilder<ElemType>::BuildNetworkFromDescription(ComputationNetwork*)
{
if (!m_net || m_net->GetTotalNumberOfNodes() < 1) //not built yet

Просмотреть файл

@ -7,7 +7,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template<typename ElemType>
template<class ElemType>
class ExperimentalNetworkBuilder : public IComputationNetBuilder<ElemType>
{
typedef shared_ptr<ComputationNetwork> ComputationNetworkPtr;

Просмотреть файл

@ -147,13 +147,13 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
bool ret = false;
if (allowUndeterminedVariable)
*allowUndeterminedVariable = true; // be default we allow undetermined variables
if (EqualInsensitive(nodeType, InputValue<ElemType>::TypeName(), L"Input"))
if (EqualInsensitive(nodeType, OperationNameOf(InputValue), L"Input"))
ret = true;
else if (EqualInsensitive(nodeType, InputValue<ElemType>::SparseTypeName(), L"SparseInput"))
ret = true;
else if (EqualInsensitive(nodeType, LearnableParameter<ElemType>::TypeName(), L"Parameter"))
else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter"))
ret = true;
//else if (EqualInsensitive(nodeType, SparseLearnableParameter<ElemType>::TypeName(), L"SparseParameter"))
//else if (EqualInsensitive(nodeType, OperationNameOf(SparseLearnableParameter), L"SparseParameter"))
// ret = true;
else if (EqualInsensitive(nodeType, L"Constant", L"Const"))
ret = true;
@ -161,115 +161,115 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
ret = true;
else if (EqualInsensitive(nodeType, L"SparseImageInput", L"SparseImage"))
ret = true;
else if (EqualInsensitive(nodeType, SumElementsNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(SumElementsNode)))
ret = true;
else if (EqualInsensitive(nodeType, SumColumnElementsNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(SumColumnElementsNode)))
ret = true;
else if (EqualInsensitive(nodeType, ScaleNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(ScaleNode)))
ret = true;
else if (EqualInsensitive(nodeType, TransposeNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(TransposeNode)))
ret = true;
else if (EqualInsensitive(nodeType, TimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(TimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, TransposeTimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(TransposeTimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, StrideTimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(StrideTimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, ElementTimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(ElementTimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, RowElementTimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(RowElementTimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, ColumnElementTimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(ColumnElementTimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, DiagTimesNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(DiagTimesNode)))
ret = true;
else if (EqualInsensitive(nodeType, CosDistanceNode<ElemType>::TypeName(), L"CosDist"))
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist"))
ret = true;
else if (EqualInsensitive(nodeType, KhatriRaoProductNode<ElemType>::TypeName(), L"ColumnwiseCrossProduct"))
else if (EqualInsensitive(nodeType, OperationNameOf(KhatriRaoProductNode), L"ColumnwiseCrossProduct"))
ret = true;
else if (EqualInsensitive(nodeType, PlusNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(PlusNode)))
ret = true;
else if (EqualInsensitive(nodeType, MinusNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(MinusNode)))
ret = true;
else if (EqualInsensitive(nodeType, NegateNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(NegateNode)))
ret = true;
else if (EqualInsensitive(nodeType, RectifiedLinearNode<ElemType>::TypeName(), L"ReLU"))
else if (EqualInsensitive(nodeType, OperationNameOf(RectifiedLinearNode), L"ReLU"))
ret = true;
else if (EqualInsensitive(nodeType, SigmoidNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(SigmoidNode)))
ret = true;
else if (EqualInsensitive(nodeType, TanhNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(TanhNode)))
ret = true;
else if (EqualInsensitive(nodeType, ExpNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(ExpNode)))
ret = true;
else if (EqualInsensitive(nodeType, LogNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(LogNode)))
ret = true;
else if (EqualInsensitive(nodeType, CosineNode<ElemType>::TypeName(), L"Cos"))
else if (EqualInsensitive(nodeType, OperationNameOf(CosineNode), L"Cos"))
ret = true;
else if (EqualInsensitive(nodeType, SoftmaxNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(SoftmaxNode)))
ret = true;
else if (EqualInsensitive(nodeType, LogSoftmaxNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(LogSoftmaxNode)))
ret = true;
else if (EqualInsensitive(nodeType, SquareErrorNode<ElemType>::TypeName(), L"SE"))
else if (EqualInsensitive(nodeType, OperationNameOf(SquareErrorNode), L"SE"))
ret = true;
else if (EqualInsensitive(nodeType, CrossEntropyWithSoftmaxNode<ElemType>::TypeName(), L"CEWithSM"))
else if (EqualInsensitive(nodeType, OperationNameOf(CrossEntropyWithSoftmaxNode), L"CEWithSM"))
ret = true;
else if (EqualInsensitive(nodeType, CrossEntropyNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(CrossEntropyNode)))
ret = true;
else if (EqualInsensitive(nodeType, ClassBasedCrossEntropyWithSoftmaxNode<ElemType>::TypeName(), L"CBCEWithSM"))
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM"))
ret = true;
else if (EqualInsensitive(nodeType, MatrixL1RegNode<ElemType>::TypeName(), L"L1Reg"))
else if (EqualInsensitive(nodeType, OperationNameOf(MatrixL1RegNode), L"L1Reg"))
ret = true;
else if (EqualInsensitive(nodeType, MatrixL2RegNode<ElemType>::TypeName(), L"L2Reg"))
else if (EqualInsensitive(nodeType, OperationNameOf(MatrixL2RegNode), L"L2Reg"))
ret = true;
else if (EqualInsensitive(nodeType, PerDimMeanVarNormalizationNode<ElemType>::TypeName(),L"PerDimMVNorm"))
else if (EqualInsensitive(nodeType, OperationNameOf(PerDimMeanVarNormalizationNode), L"PerDimMVNorm"))
ret = true;
else if (EqualInsensitive(nodeType, PerDimMeanVarDeNormalizationNode<ElemType>::TypeName(),L"PerDimMVDeNorm"))
else if (EqualInsensitive(nodeType, OperationNameOf(PerDimMeanVarDeNormalizationNode), L"PerDimMVDeNorm"))
ret = true;
else if (EqualInsensitive(nodeType, ErrorPredictionNode<ElemType>::TypeName(), L"ClassificationError"))
else if (EqualInsensitive(nodeType, OperationNameOf(ErrorPredictionNode), L"ClassificationError"))
ret = true;
else if (EqualInsensitive(nodeType, DropoutNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(DropoutNode)))
ret = true;
else if (EqualInsensitive(nodeType, ReshapeNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(ReshapeNode)))
ret = true;
else if (EqualInsensitive(nodeType, RowRepeatNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(RowRepeatNode)))
ret = true;
else if (EqualInsensitive(nodeType, MeanNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(MeanNode)))
ret = true;
else if (EqualInsensitive(nodeType, InvStdDevNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(InvStdDevNode)))
ret = true;
else if (EqualInsensitive(nodeType, ConvolutionNode<ElemType>::TypeName(), L"Convolve"))
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve"))
ret = true;
else if (EqualInsensitive(nodeType, MaxPoolingNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(MaxPoolingNode)))
ret = true;
else if (EqualInsensitive(nodeType, AveragePoolingNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(AveragePoolingNode)))
ret = true;
else if (EqualInsensitive(nodeType, PastValueNode<ElemType>::TypeName(), L"Delay"))
else if (EqualInsensitive(nodeType, OperationNameOf(PastValueNode), L"Delay"))
ret = true;
else if (EqualInsensitive(nodeType, FutureValueNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(FutureValueNode)))
ret = true;
else if (EqualInsensitive(nodeType, RowSliceNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(RowSliceNode)))
ret = true;
else if (EqualInsensitive(nodeType, RowStackNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(RowStackNode)))
ret = true;
else if (EqualInsensitive(nodeType, LookupTableNode<ElemType>::TypeName()))
else if (EqualInsensitive(nodeType, OperationNameOf(LookupTableNode)))
ret = true;
else if (EqualInsensitive(nodeType, GMMLogLikelihoodNode<ElemType>::TypeName(), L"GMMLL"))
else if (EqualInsensitive(nodeType, OperationNameOf(GMMLogLikelihoodNode), L"GMMLL"))
ret = true;
else if (EqualInsensitive(nodeType, CosDistanceWithNegativeSamplesNode<ElemType>::TypeName(), L"CosWithNegSamples"))
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples"))
ret = true;
else if (EqualInsensitive(nodeType, TimeReverseNode<ElemType>::TypeName(), L"TimeReverse"))
else if (EqualInsensitive(nodeType, OperationNameOf(TimeReverseNode), L"TimeReverse"))
ret = true;
else if (EqualInsensitive(nodeType, CRFNode<ElemType>::TypeName(), L"CRF"))
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF"))
ret = true;
else if (EqualInsensitive(nodeType, DummyCriterionNode<ElemType>::TypeName(), L"DummyCriterion"))
else if (EqualInsensitive(nodeType, OperationNameOf(DummyCriterionNode), L"DummyCriterion"))
ret = true;
else if (EqualInsensitive(nodeType, ParallelNode<ElemType>::TypeName(), L"Parallel"))
else if (EqualInsensitive(nodeType, OperationNameOf(ParallelNode), L"Parallel"))
ret = true;
else if (EqualInsensitive(nodeType, LSTMNode<ElemType>::TypeName(), L"LSTM"))
else if (EqualInsensitive(nodeType, OperationNameOf(LSTMNode), L"LSTM"))
ret = true;
else if (EqualInsensitive(nodeType, PairNetworkNode<ElemType>::TypeName(), L"PairNetwork"))
else if (EqualInsensitive(nodeType, OperationNameOf(PairNetworkNode), L"PairNetwork"))
ret = true;
else if (EqualInsensitive(nodeType, StrideTimesNode<ElemType>::TypeName(), L"StrideTimes"))
else if (EqualInsensitive(nodeType, OperationNameOf(StrideTimesNode), L"StrideTimes"))
ret = true;
// return the actual node name in the parameter if we found something

Просмотреть файл

@ -2468,7 +2468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//output = builder.Softmax(output);
//output = builder.Log(output);
scaledLogLikelihood = builder.CreateComputationNode(MinusNode<ElemType>::TypeName(), L"ScaledLogLikelihood");
scaledLogLikelihood = builder.CreateComputationNode(OperationNameOf(MinusNode), L"ScaledLogLikelihood");
scaledLogLikelihood->AttachInputs(output, input);
m_net->OutputNodes().push_back(scaledLogLikelihood);
}
@ -2490,11 +2490,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr output;
wstring nonLinearFunction = m_nonLinearFunctions[layer];
if (nonLinearFunction == SigmoidNode<ElemType>::TypeName())
if (nonLinearFunction == OperationNameOf(SigmoidNode))
output = builder.Sigmoid(input, nodeName);
else if (nonLinearFunction == RectifiedLinearNode<ElemType>::TypeName())
else if (nonLinearFunction == OperationNameOf(RectifiedLinearNode))
output = builder.RectifiedLinear(input, nodeName);
else if (nonLinearFunction == TanhNode<ElemType>::TypeName())
else if (nonLinearFunction == OperationNameOf(TanhNode))
output = builder.Tanh(input, nodeName);
else if (nonLinearFunction == L"None" || nonLinearFunction == L"none" || nonLinearFunction == L"")
{

Просмотреть файл

@ -254,6 +254,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return std::string(tag) == expectedTag;
}
// this load function allows an alternative file format of an early internal predecessor of CNTK, internally called DBN.exe
virtual ComputationNetwork* LoadNetworkFromFile(const wstring& modelFileName, bool forceLoad = true,
bool bAllowNoCriterion = false, ComputationNetwork* anotherNetwork = nullptr)
{

Просмотреть файл

@ -15,7 +15,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template<typename ElemType>
template<class ElemType>
void SynchronousNodeEvaluator<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{
ComputationNetworkBuilder<ElemType> builder(m_net);
@ -55,7 +55,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
if (InputValue<ElemType>::TypeName() == cnNodeType)
if (OperationNameOf(InputValue) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
@ -127,7 +127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr = builder.CreateSparseInputNode(name, imageWidth, imageHeight, imageChannels, numImages);
}
}
else if (LearnableParameter<ElemType>::TypeName() == cnNodeType)
else if (OperationNameOf(LearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
@ -171,13 +171,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2);
if(!fexists(initFromFilePath))
RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str());
m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath);
dynamic_pointer_cast<LearnableParameter<ElemType>>(nodePtr)->InitFromFile(msra::strfun::utf16(initFromFilePath));
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]");
}
}
else if (SparseLearnableParameter<ElemType>::TypeName() == cnNodeType)
else if (OperationNameOf(SparseLearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
@ -219,7 +219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2);
if(!fexists(initFromFilePath))
RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str());
m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath);
dynamic_pointer_cast<SparseLearnableParameter<ElemType>>(nodePtr)->InitFromFile(msra::strfun::utf16(initFromFilePath));
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]");
@ -244,7 +244,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->FunctionValues().SetValue(val);
}
}
else if (cnNodeType == RowSliceNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(RowSliceNode))
{
if (parameter.size() != 3)
RuntimeError("RowSlice should have three parameters. Usage: RowSlice(startRowIndex, numRows, origNodeName.");
@ -264,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->NeedGradient() = needGradient;
}
}
else if (cnNodeType == RowRepeatNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(RowRepeatNode))
{
if (parameter.size() != 2)
RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats.");
@ -283,7 +283,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->NeedGradient() = needGradient;
}
}
else if (cnNodeType == ReshapeNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(ReshapeNode))
{
if (parameter.size() < 2 || parameter.size() > 5)
RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=].");
@ -305,8 +305,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->NeedGradient() = needGradient;
}
}
else if (cnNodeType == PastValueNode<ElemType>::TypeName() ||
cnNodeType == FutureValueNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(PastValueNode) ||
cnNodeType == OperationNameOf(FutureValueNode))
{
if (parameter.size() <2 || parameter.size() >3)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, [cols], m, [timeStep=1, defaultPastValue=0.1]).");
@ -332,7 +332,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
timeStep = node->GetOptionalParameter("delayTime", "1");
}
if (cnNodeType == PastValueNode<ElemType>::TypeName())
if (cnNodeType == OperationNameOf(PastValueNode))
{
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, cols, name);
static_pointer_cast<PastValueNode<ElemType>>(nodePtr)->SetTimeStep(timeStep);
@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->NeedGradient() = needGradient; // TODO: what's this for?
}
}
else if (cnNodeType == ConvolutionNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(ConvolutionNode))
{
if (parameter.size() != 7)
RuntimeError("%ls should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str());
@ -379,7 +379,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples);
}
}
else if (cnNodeType == MaxPoolingNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str());
@ -406,7 +406,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
horizontalSubsample, verticalSubsample, name);
}
}
else if (cnNodeType == AveragePoolingNode<ElemType>::TypeName())
else if (cnNodeType == OperationNameOf(AveragePoolingNode))
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str());
@ -457,7 +457,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
std::vector<void*> inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass);
if (cnNodeType == RowStackNode<ElemType>::TypeName()) //support variable length inputs
if (cnNodeType == OperationNameOf(RowStackNode)) //support variable length inputs
{
std::vector<ComputationNodeBasePtr> inputNodes;
inputNodes.resize(inputs.size());

Просмотреть файл

@ -0,0 +1,197 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}</ProjectGuid>
<SccProjectName>
</SccProjectName>
<SccAuxPath>
</SccAuxPath>
<SccLocalPath>
</SccLocalPath>
<SccProvider>
</SccProvider>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTK</RootNamespace>
<ProjectName>CNTKComputationNetworkLib</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(SolutionDir)$(Platform)\$(Configuration);$(SolutionDir)..\Common\lib;$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\$(Platform)</LibraryPath>
<CustomBuildAfterTargets>Build</CustomBuildAfterTargets>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(SolutionDir)$(Platform)\$(Configuration);$(SolutionDir)..\Common\lib;$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\$(Platform)</LibraryPath>
<CustomBuildAfterTargets>Build</CustomBuildAfterTargets>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
<AdditionalIncludeDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
<CustomBuildStep>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>$(TargetDir)config.txt;$(TargetDir)labels.txt;$(TargetDir)network.txt;$(TargetDir)NdlScript.txt</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>Copy content files to target directory</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalIncludeDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKMath.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<AdditionalLibraryDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
<CustomBuildStep>
<Command>
</Command>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>
</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>
</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\..\Common\Include\basetypes.h" />
<ClInclude Include="..\..\Common\Include\Basics.h" />
<ClInclude Include="..\..\Common\Include\BestGpu.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\nvml.h" />
<ClInclude Include="..\..\Common\Include\Platform.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="..\..\Math\Math\Matrix.h" />
<ClInclude Include="CompositeComputationNodes.h" />
<ClInclude Include="ComputationNetwork.h" />
<ClInclude Include="ComputationNetworkBuilder.h" />
<ClInclude Include="ComputationNode.h" />
<ClInclude Include="ConvolutionalNodes.h" />
<ClInclude Include="DecoderNode.h" />
<ClInclude Include="EvaluationCriterionNodes.h" />
<ClInclude Include="InputAndParamNodes.h" />
<ClInclude Include="LinearAlgebraNodes.h" />
<ClInclude Include="MatrixPool.h" />
<ClInclude Include="NonlinearityNodes.h" />
<ClInclude Include="RecurrentNodes.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="TrainingCriterionNodes.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\BestGpu.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="ComputationNetwork.cpp" />
<ClCompile Include="ComputationNetworkBuilder.cpp" />
<ClCompile Include="ComputationNode.cpp" />
<ClCompile Include="NetworkBuilderFromConfig.cpp" />
<ClCompile Include="stdafx.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

Просмотреть файл

@ -0,0 +1,129 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="ComputationNode.cpp">
<Filter>Nodes</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\BestGpu.cpp">
<Filter>GPU Interfacing</Filter>
</ClCompile>
<ClCompile Include="ComputationNetworkBuilder.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="ComputationNetwork.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="NetworkBuilderFromConfig.cpp">
<Filter>Experimental</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="ComputationNetwork.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="ComputationNode.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\Basics.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\nvml.h">
<Filter>GPU Interfacing</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\BestGpu.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="CompositeComputationNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="EvaluationCriterionNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="TrainingCriterionNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="NonlinearityNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="LinearAlgebraNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="ConvolutionalNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="RecurrentNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="InputAndParamNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="DecoderNode.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\CrossProcessMutex.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\Platform.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="ComputationNetworkBuilder.h">
<Filter>Network</Filter>
</ClInclude>
<ClInclude Include="MatrixPool.h">
<Filter>Network</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
<UniqueIdentifier>{b3d05c7b-7bcf-4b12-bcb5-dced86717202}</UniqueIdentifier>
</Filter>
<Filter Include="Common\Include">
<UniqueIdentifier>{85226dda-87ba-4da6-af04-563d0ce23b94}</UniqueIdentifier>
</Filter>
<Filter Include="Nodes">
<UniqueIdentifier>{0b366814-48b2-4619-bf92-85ee24e3cbc1}</UniqueIdentifier>
</Filter>
<Filter Include="Misc">
<UniqueIdentifier>{3c119a92-ffb2-4850-adae-01778324974d}</UniqueIdentifier>
</Filter>
<Filter Include="GPU Interfacing">
<UniqueIdentifier>{8d99b2cc-5209-40e4-8b4b-a7616973ae3b}</UniqueIdentifier>
</Filter>
<Filter Include="Experimental">
<UniqueIdentifier>{fe2443a1-6323-449f-96be-cbd0f608f382}</UniqueIdentifier>
</Filter>
<Filter Include="Network">
<UniqueIdentifier>{498bb2e9-53de-4955-970e-813e3f21025b}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -596,24 +596,24 @@ public:
"should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
}
if (!(Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() &&
Inputs(2)->OperationName() == LearnableParameter<ElemType>::TypeName()) &&
!(Inputs(1)->OperationName() == MeanNode<ElemType>::TypeName() &&
Inputs(2)->OperationName() == InvStdDevNode<ElemType>::TypeName()))
if (!(Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) &&
Inputs(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
!(Inputs(1)->OperationName() == OperationNameOf(MeanNode) &&
Inputs(2)->OperationName() == OperationNameOf(InvStdDevNode)))
{
LogicError(
"PerDimMeanVarNormalizationNode criterion requires the last two inputs to be LearnableParameter "
"type or (Mean, InvStdDev) so that the values will be saved.");
}
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = (Inputs(1)->FunctionValues().GetNumRows() == 0) ? Inputs(0)->FunctionValues().GetNumRows() :
Inputs(1)->FunctionValues().GetNumRows();
Inputs(1)->FunctionValues().Resize(rows, 1);
}
if (Inputs(2)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(2)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = (Inputs(2)->FunctionValues().GetNumRows() == 0) ? Inputs(0)->FunctionValues().GetNumRows() :
Inputs(2)->FunctionValues().GetNumRows();
@ -756,24 +756,24 @@ public:
"should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
}
if (!(Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() &&
Inputs(2)->OperationName() == LearnableParameter<ElemType>::TypeName()) &&
!(Inputs(1)->OperationName() == MeanNode<ElemType>::TypeName() &&
Inputs(2)->OperationName() == InvStdDevNode<ElemType>::TypeName()))
if (!(Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) &&
Inputs(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
!(Inputs(1)->OperationName() == OperationNameOf(MeanNode) &&
Inputs(2)->OperationName() == OperationNameOf(InvStdDevNode)))
{
throw std::logic_error(
"PerDimMeanVarDeNormalizationNode criterion requires the last two inputs to be "
"LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
}
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(1)->FunctionValues().GetNumRows() == 0 ? Inputs(0)->FunctionValues().GetNumRows() :
Inputs(1)->FunctionValues().GetNumRows();
Inputs(1)->FunctionValues().Resize(rows, 1);
}
if (Inputs(2)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(2)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(2)->FunctionValues().GetNumRows() == 0? Inputs(0)->FunctionValues().GetNumRows() :
Inputs(2)->FunctionValues().GetNumRows();

Просмотреть файл

@ -11,8 +11,8 @@
#include "ComputationNetworkBuilder.h" // used for load & save
//#include "InputAndParamNodes.h"
#include "LinearAlgebraNodes.h"
//#include "NonlinearityNodes.h"
//#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ConvolutionalNodes.h"
#include "RecurrentNodes.h"
//#include "DecoderNode.h"
#include "TrainingCriterionNodes.h"
@ -238,7 +238,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
{
ComputationNodeBasePtr node = nodeIter->second;
if (node->OperationName() == LearnableParameter<float>::TypeName())
if (node->OperationName() == OperationNameOf(LearnableParameter))
node->NeedGradient() = needGradient;
}
}
@ -249,7 +249,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
{
ComputationNodeBasePtr node = (*nodeIter);
if (node->OperationName() == LearnableParameter<float>::TypeName())
if (node->OperationName() == OperationNameOf(LearnableParameter))
node->NeedGradient() = needGradient;
}
}
@ -257,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template<typename ElemType> void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr node, const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly)
template<class ElemType> void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr node, const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly)
{
auto learnableParameterNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(node);
learnableParameterNode->InitRandom(uniformInit, randomSeed + GetRandomSeedOffset(), initValueScale, initOnCPUOnly);
@ -266,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// FixupInputMinibatchSize - go through all the inputs and make sure they have a consistent minibatch size (after creation)
void ComputationNetwork::FixupInputMinibatchSize()
{
std::list<ComputationNodeBasePtr> inputs = GetNodesWithType(InputValue<float>::TypeName());
std::list<ComputationNodeBasePtr> inputs = GetNodesWithType(OperationNameOf(InputValue));
int minibatchMax = 0;
bool minibatchDifferent = false; // flag to see if all the values are already the same
for (ComputationNodeBasePtr node : inputs)
@ -300,8 +300,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto ptr = recurrentNodes.begin(); ptr != recurrentNodes.end(); ptr++)
{
if ((*ptr)->IsFuncValueOlderThanInputs() &&
(*ptr)->OperationName() != PastValueNode<float>::TypeName() &&
(*ptr)->OperationName() != FutureValueNode<float>::TypeName())
(*ptr)->OperationName() != OperationNameOf(PastValueNode) &&
(*ptr)->OperationName() != OperationNameOf(FutureValueNode))
{
return true;
}
@ -311,13 +311,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
{
if (nodePtr->OperationName() == SquareErrorNode<float>::TypeName() ||
nodePtr->OperationName() == CrossEntropyWithSoftmaxNode<float>::TypeName() ||
nodePtr->OperationName() == CrossEntropyNode<float>::TypeName() ||
nodePtr->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode<float>::TypeName() ||
nodePtr->OperationName() == ErrorPredictionNode<float>::TypeName() ||
nodePtr->OperationName() == CRFNode<float>::TypeName() ||
nodePtr->OperationName() == DummyCriterionNode<float>::TypeName())
if (nodePtr->OperationName() == OperationNameOf(SquareErrorNode) ||
nodePtr->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode) ||
nodePtr->OperationName() == OperationNameOf(CrossEntropyNode) ||
nodePtr->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
nodePtr->OperationName() == OperationNameOf(ErrorPredictionNode) ||
nodePtr->OperationName() == OperationNameOf(CRFNode) ||
nodePtr->OperationName() == OperationNameOf(DummyCriterionNode))
return true;
return false;
@ -330,10 +330,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//SumElements node will generate a scalar value and so it should never require special handling
//TransposeNode will change the size of columns and so it should also not included for special handling
//their child node should instead
if (node->OperationName() != SumElementsNode<float>::TypeName() &&
node->OperationName() != TransposeNode<float>::TypeName() &&
node->OperationName() != MeanNode<float>::TypeName() &&
node->OperationName() != InvStdDevNode<float>::TypeName()
if (node->OperationName() != OperationNameOf(SumElementsNode) &&
node->OperationName() != OperationNameOf(TransposeNode) &&
node->OperationName() != OperationNameOf(MeanNode) &&
node->OperationName() != OperationNameOf(InvStdDevNode)
)
node->SetReqMultiSeqHandlingTo(true);
}
@ -540,8 +540,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
visited.insert(cur);
recStack.insert(cur);
if (cur->OperationName() != PastValueNode<float>::TypeName() &&
cur->OperationName() != FutureValueNode<float>::TypeName())
if (cur->OperationName() != OperationNameOf(PastValueNode) &&
cur->OperationName() != OperationNameOf(FutureValueNode))
{
for (size_t i = 0; i < cur->ChildrenSize(); i++)
if (cur->GetChildren()[i]->LoopId() == cur->LoopId())
@ -617,8 +617,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = 0; i < nodeRecIter->ChildrenSize(); i++)
{
if (nodeRecIter->GetChildren()[i]->LoopId() == nodeRecIter->LoopId() &&
nodeRecIter->OperationName() != PastValueNode<float>::TypeName() &&
nodeRecIter->OperationName() != FutureValueNode<float>::TypeName()) // TODO: test for type RecurrentNode instead?
nodeRecIter->OperationName() != OperationNameOf(PastValueNode) &&
nodeRecIter->OperationName() != OperationNameOf(FutureValueNode)) // TODO: test for type RecurrentNode instead?
{
nodeRecIter->GetChildren()[i]->SetIndexInLoop(nodeRecIter->GetChildren()[i]->GetIndexInLoop() + 1);
}
@ -690,11 +690,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
ComputationNodeBasePtr nodeRecIter = recurrentInfo->m_recurrentNodes[j];
if (nodeRecIter->OperationName() == PastValueNode<float>::TypeName())
if (nodeRecIter->OperationName() == OperationNameOf(PastValueNode))
{
hasPastValueNode = true;
}
else if (nodeRecIter->OperationName() == FutureValueNode<float>::TypeName())
else if (nodeRecIter->OperationName() == OperationNameOf(FutureValueNode))
{
hasFutureValueNode = true;
}
@ -778,7 +778,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodeIter++)
{
ComputationNodeBasePtr node = (*nodeIter);
if (node->OperationName() == InputValue<float>::TypeName() /*L"InputValue"*/ ||
if (node->OperationName() == OperationNameOf(InputValue) /*L"InputValue"*/ ||
node->OperationName() == InputValue<float>::SparseTypeName())
{
inputs.push_back(node);
@ -798,8 +798,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
{
ComputationNodeBasePtr node = (*nodeIter);
if ((node->OperationName() == LearnableParameter<float>::TypeName() && node->NeedGradient()) ||
(node->OperationName() == SparseLearnableParameter<float>::TypeName() && node->NeedGradient()))
if ((node->OperationName() == OperationNameOf(LearnableParameter) && node->NeedGradient()) ||
(node->OperationName() == OperationNameOf(SparseLearnableParameter) && node->NeedGradient()))
{
learnableParameterNames.push_back(node->NodeName());
}
@ -816,11 +816,55 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
/*static*/void ComputationNetwork::UpdateEvalTimeStamps(const std::vector<ComputationNodeBasePtr> & nodes)
{
for (size_t i = 0; i<nodes.size(); i++)
nodes[i]->UpdateEvalTimeStamp();
}
template<class ElemType>
/*static*/void ComputationNetwork::SetDropoutRate(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed)
{
if (dropoutRate != prevDropoutRate)
{
fprintf(stderr, "Switching dropout rate to %.8g.\n", dropoutRate);
std::list<ComputationNodeBasePtr> dropoutNodes = net.GetNodesWithType(OperationNameOf(DropoutNode), criterionNode);
if (dropoutNodes.size() == 0 && dropoutRate > 0)
fprintf(stderr, "WARNING: there is no dropout node.\n");
else for (auto nodeIter = dropoutNodes.begin(); nodeIter != dropoutNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(*nodeIter);
node->SetDropoutRate(dropoutRate);
node->SetRandomSeed(dropOutSeed++);
}
prevDropoutRate = dropoutRate;
}
}
/*static*/void ComputationNetwork::SetMaxTempMemSizeForCNN(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const size_t maxTempMemSizeInSamples)
{
fprintf(stderr, "Set Max Temp Mem Size For Convolution Nodes to %lu samples.\n", maxTempMemSizeInSamples);
std::list<ComputationNodeBasePtr> convolutionNodes = net.GetNodesWithType(OperationNameOf(ConvolutionNode), criterionNode);
if (convolutionNodes.size() == 0 && maxTempMemSizeInSamples != 0)
{
fprintf(stderr, "WARNING: there is no convolution node.\n");
}
else
{
for (auto nodeIter = convolutionNodes.begin(); nodeIter != convolutionNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
node->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}
// -----------------------------------------------------------------------
// serialization
// -----------------------------------------------------------------------
template<typename ElemType> void ComputationNetwork::LoadFromFile(const std::wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork)
template<class ElemType> void ComputationNetwork::LoadFromFile(const std::wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork)
{
ClearNet();
@ -880,7 +924,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int j = 0; j < numChildren; j++)
childrenNodes[j] = GetNodeFromName(childrenNames[j], anotherNetwork);
if (nodePtr->OperationName() == RowStackNode<float>::TypeName()) {
if (nodePtr->OperationName() == OperationNameOf(RowStackNode)) {
//allow for variable input nodes
nodePtr->AttachInputs(childrenNodes);
}
@ -1070,7 +1114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<ComputationNodeBasePtr> pastValueNodes;
for (auto n : allnodes)
{
if (n->OperationName() == PastValueNode<float>::TypeName() || n->OperationName() == L"Delay")
if (n->OperationName() == OperationNameOf(PastValueNode) || n->OperationName() == L"Delay")
pastValueNodes.push_back(n);
}
@ -1078,14 +1122,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<ComputationNodeBasePtr> futureValueNodes;
for (auto n : allnodes)
{
if (n->OperationName() == FutureValueNode<float>::TypeName())
if (n->OperationName() == OperationNameOf(FutureValueNode))
futureValueNodes.push_back(n);
}
// get learnableParameters
std::vector<ComputationNodeBasePtr> learnableParameters;
for (auto n : allnodes)
{
if (n->OperationName() == LearnableParameter<float>::TypeName())
if (n->OperationName() == OperationNameOf(LearnableParameter))
learnableParameters.push_back(n);
}
@ -1173,7 +1217,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::wstring srcname = src->GetName();
std::wstring desname = des->GetName();
if (des->OperationName() == PastValueNode<float>::TypeName() || des->OperationName() == L"Delay")
if (des->OperationName() == OperationNameOf(PastValueNode) || des->OperationName() == L"Delay")
{
// special treament for arc with PastValue node as the children
// create a dummy node
@ -1185,7 +1229,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
line = out;
line += msra::strfun::wstrprintf(L"\"%ls\" -> \"%ls\" ; \n", dummyName.c_str(), srcname.c_str());
}
else if (des->OperationName() == FutureValueNode<float>::TypeName())
else if (des->OperationName() == OperationNameOf(FutureValueNode))
{
// special treament for arc with FutureValue node as the children
// create a dummy node
@ -1237,7 +1281,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// This function performs SVD decomposition for different groups of learnable parameters
template<typename ElemType> void ComputationNetwork::PerformSVDecomposition(const map<wstring, float>& SVDConfig)
template<class ElemType> void ComputationNetwork::PerformSVDecomposition(const map<wstring, float>& SVDConfig)
{
vector<pair<vector<wstring>, float>> nodeGroups;
wregex NameFilter;
@ -1386,9 +1430,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template void ComputationNetwork::InitLearnableParameters<float>(const ComputationNodeBasePtr node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::LoadFromFile<float>(const std::wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig);
template /*static*/void ComputationNetwork::SetDropoutRate<float>(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template void ComputationNetwork::InitLearnableParameters<double>(const ComputationNodeBasePtr node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::LoadFromFile<double>(const std::wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig);
template /*static*/void ComputationNetwork::SetDropoutRate<double>(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
}}}

Просмотреть файл

@ -216,7 +216,6 @@ public:
// serialization
// -----------------------------------------------------------------------
// TODO: how does the file distinguish float vs double nodes?
void SaveToFile(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary) const;
private:
void SaveToFileImpl(const std::wstring& fileName, const FileOptions fileFormat) const;
@ -224,7 +223,9 @@ public:
void LoadPersistableParametersFromFile(const std::wstring& fileName, const bool requireValidation = true,
const FileOptions fileFormat = FileOptions::fileOptionsBinary);
template<typename ElemType>
// design BUGBUG: binary files do not know whether they are float or double.
// TODO: modify file format to know this; then eliminate the <ElemType> dependency (and in some future, allow nodes to be different)
template<class ElemType>
void LoadFromFile(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary,
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr);
@ -270,103 +271,13 @@ public:
return numAllSamples;
}
// -----------------------------------------------------------------------
// serialization
// -----------------------------------------------------------------------
// Read a matrix stored in text format from 'filePath' (whitespace-separated columns, newline-separated rows),
// and return a flat array containing the contents of this file in column-major format.
// filePath: path to file containing matrix in text format.
// numRows/numCols: after this function is called, these parameters contain the number of rows/columns in the matrix.
// returns: a flat array containing the contents of this file in column-major format
// NOTE: caller is responsible for deleting the returned buffer once it is finished using it.
// TODO: change to return a std::vector<ElemType>; solves the ownership issue
// TODO: move this elsewhere, this is a general utility function that does not belong into the ComputationNetwork class
template<class ElemType>
static ElemType* LoadArrayFromTextFile(const std::string filePath, size_t& numRows, size_t& numCols)
{
size_t r = 0;
size_t numColsInFirstRow = 0;
// NOTE: Not using the Microsoft.MSR.CNTK.File API here because it
// uses a buffer of fixed size, which doesn't allow very long rows.
// See fileutil.cpp fgetline method (std::string fgetline (FILE * f) { fixed_vector<char> buf (1000000); ... })
std::ifstream myfile(filePath);
// load matrix into vector of vectors (since we don't know the size in advance).
std::vector<std::vector<ElemType>> elements;
if (myfile.is_open())
{
std::string line;
while (std::getline(myfile, line))
{
// Break on empty line. This allows there to be an empty line at the end of the file.
if (line == "")
break;
istringstream iss(line);
ElemType element;
int numElementsInRow = 0;
elements.push_back(std::vector<ElemType>());
while (iss >> element)
{
elements[r].push_back(element);
numElementsInRow++;
}
if (r == 0)
numColsInFirstRow = numElementsInRow;
else if (numElementsInRow != numColsInFirstRow)
RuntimeError("The rows in the provided file do not all have the same number of columns: " + filePath);
r++;
}
myfile.close();
}
else
RuntimeError("Unable to open file");
numRows = r;
numCols = numColsInFirstRow;
ElemType* pArray = new ElemType[numRows * numCols];
// Perform transpose when copying elements from vectors to ElemType[],
// in order to store in column-major format.
for (int i = 0; i < numCols; i++)
{
for (int j = 0; j < numRows; j++)
pArray[i * numRows + j] = elements[j][i];
}
return pArray;
}
// TODO: why is this here? Move to LearnableParameter class?
template<class ElemType>
static void InitLearnableParametersFromFile(const shared_ptr<ComputationNode<ElemType>> node,
const std::wstring & initFromFilePath,
DEVICEID_TYPE deviceId) // TODO: why not just use node->m_deviceId?
{
size_t numRows = 0;
size_t numCols = 0;
ElemType *pArray = LoadArrayFromTextFile<ElemType>(msra::strfun::utf8(initFromFilePath), numRows, numCols); // TODO: change pathname to wstring
node->FunctionValues().SetValue(numRows, numCols, pArray, matrixFlagNormal, deviceId);
delete[] pArray; // TODO: use std::vector to avoid mem leak on error
}
template<class ElemType>
void InitLearnableParametersFromFile(const shared_ptr<ComputationNode<ElemType>> node, const std::string & initFromFilePath) // TODO: remove this method or change pathname to wstring
{
InitLearnableParametersFromFile(node, msra::strfun::utf16(initFromFilePath), this->GetDeviceID());
}
// -----------------------------------------------------------------------
// node construction
// -----------------------------------------------------------------------
// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template<typename ElemType>
template<class ElemType>
void InitLearnableParameters(const ComputationNodeBasePtr node,
const bool uniformInit,
const unsigned long randomSeed,
@ -679,6 +590,9 @@ public:
void SetNodesReqMultiSeqHandling();
// MAIN ENTRY POINT for evaluation (forward prop)
// TODO: pass a set of nodes instead of only one
// TODO: rename to ForwardProp()? To make it very clear?
void Evaluate(const ComputationNodeBasePtr rootNode)
{
BuildAndValidateNetwork(rootNode);
@ -793,7 +707,9 @@ public:
}
}
template<typename ElemType>
// MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
// TODO: pass a set of nodes instead of only one
template<class ElemType>
void ComputeGradient(const ComputationNodeBasePtr rootNode,
bool bResetToOne = true, /// true if reset the gradient of rootnode to 1.0
const Matrix<ElemType>* rootGradientInitValue = nullptr,
@ -807,6 +723,7 @@ public:
//run forward pass first
Evaluate(rootNode);
// TODO: comment what the purpose of this is
if (bClearGradient)
ClearGradientForAllNodes(rootNode);
@ -872,6 +789,12 @@ public:
}
}
// a few more helpers
static void UpdateEvalTimeStamps(const std::vector<ComputationNodeBasePtr> & nodes);
template<class ElemType> // TODO: dropoutRate change to double
static void SetDropoutRate(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
static void SetMaxTempMemSizeForCNN(ComputationNetwork& net, const ComputationNodeBasePtr criterionNode, const size_t maxTempMemSizeInSamples);
// -----------------------------------------------------------------------
// network editing
// -----------------------------------------------------------------------
@ -1389,7 +1312,7 @@ public:
// B and C are two learnable parameters
//========================================
// BUGBUG: this only currently works for one ElemType, not both
template<typename ElemType>
template<class ElemType>
void PerformSVDecomposition(const map<wstring, float>& SVDConfig);
public:
@ -1398,7 +1321,7 @@ public:
// -----------------------------------------------------------------------
// TODO: make these templated on <ElemType> locally
template<typename ElemType>
template<class ElemType>
void GetHistory(map<wstring, Matrix<ElemType>>& history, bool bLastTime = false)
{
//put all node info first
@ -1411,7 +1334,7 @@ public:
}
};
template<typename ElemType>
template<class ElemType>
void SetHistory(map<wstring, Matrix<ElemType>>& history)
{
//put all node info first

Просмотреть файл

@ -0,0 +1,559 @@
// ComputationNetworkBuilder -- helper class for constructing ComputationNetworks and ComputationNodes from C++ (internal and external)
//
// <copyright file="ComputationNode.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "Basics.h"
#include "ComputationNetworkBuilder.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "LinearAlgebraNodes.h"
#include "NonlinearityNodes.h"
#include "ConvolutionalNodes.h"
#include "RecurrentNodes.h"
#include "DecoderNode.h"
#include "TrainingCriterionNodes.h"
#include "CompositeComputationNodes.h"
#include "EvaluationCriterionNodes.h"
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
// create a new node of a type given as a string, with var args so that this can be used at multiple places
// This function only creates nodes that accept (m_deviceId, nodeName).
template<class ElemType>
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewStandardNode(const std::wstring & nodeType, DEVICEID_TYPE deviceId, const wstring & name)
{
// please keep this table sorted
if (nodeType == OperationNameOf(CRFNode)) return New<CRFNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode)) return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ColumnElementTimesNode)) return New<ColumnElementTimesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(CosDistanceNode)) return New<CosDistanceNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(CosDistanceWithNegativeSamplesNode)) return New<CosDistanceWithNegativeSamplesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(CosineNode)) return New<CosineNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(CrossEntropyNode)) return New<CrossEntropyNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(CrossEntropyWithSoftmaxNode)) return New<CrossEntropyWithSoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(DiagTimesNode)) return New<DiagTimesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(DummyCriterionNode)) return New<DummyCriterionNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ElementTimesNode)) return New<ElementTimesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New<ErrorPredictionNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ExpNode)) return New<ExpNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(FutureValueNode)) return New<FutureValueNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(GMMLogLikelihoodNode)) return New<GMMLogLikelihoodNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(InvStdDevNode)) return New<InvStdDevNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(KhatriRaoProductNode)) return New<KhatriRaoProductNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(LSTMNode)) return New<LSTMNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(LogNode)) return New<LogNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(LogSoftmaxNode)) return New<LogSoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(LookupTableNode)) return New<LookupTableNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(MatrixL1RegNode)) return New<MatrixL1RegNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(MatrixL2RegNode)) return New<MatrixL2RegNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(MeanNode)) return New<MeanNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(MinusNode)) return New<MinusNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(NegateNode)) return New<NegateNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(NoiseContrastiveEstimationNode)) return New<NoiseContrastiveEstimationNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(PairNetworkNode)) return New<PairNetworkNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ParallelNode)) return New<ParallelNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(PastValueNode) || nodeType == L"Delay") return New<PastValueNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode) || nodeType == L"PerDimMeanVarDeNormalizationNode") return New<PerDimMeanVarDeNormalizationNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(PerDimMeanVarNormalizationNode) || nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(RowElementTimesNode)) return New<RowElementTimesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(RowRepeatNode)) return New<RowRepeatNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(RowSliceNode)) return New<RowSliceNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(RowStackNode)) return New<RowStackNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(ScaleNode)) return New<ScaleNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(SequenceDecoderNode)) return New<SequenceDecoderNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(SigmoidNode)) return New<SigmoidNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(SoftmaxNode)) return New<SoftmaxNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(SquareErrorNode)) return New<SquareErrorNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(StrideTimesNode)) return New<StrideTimesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(SumColumnElementsNode)) return New<SumColumnElementsNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(SumElementsNode)) return New<SumElementsNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(TanhNode)) return New<TanhNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(TimeReverseNode)) return New<TimeReverseNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(TimesNode)) return New<TimesNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(TransposeNode)) return New<TransposeNode<ElemType>>(deviceId, name);
else if (nodeType == OperationNameOf(TransposeTimesNode)) return New<TransposeTimesNode<ElemType>>(deviceId, name);
else return nullptr;
}
// create a new node of a type given as a string, with var args so that this can be used at multiple places
// This function is used for loading, while NewStandardNode() above is used for creating standard-type networks.
// Returns nullptr if the type name is not recognized.
template<class ElemType>
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewNode(const std::wstring & nodeType, DEVICEID_TYPE deviceId, const wstring & name)
{
    // TODO: Is this ever called with additional _Args? If not, simplify
    // first give the standard two-argument constructors a chance
    if (auto standardNode = NewStandardNode(nodeType, deviceId, name))
        return standardNode;
    // then the remaining node types, which are not covered by NewStandardNode()
    if (nodeType == OperationNameOf(AveragePoolingNode))       return New<AveragePoolingNode<ElemType>>(deviceId, name);
    if (nodeType == OperationNameOf(ConvolutionNode))          return New<ConvolutionNode<ElemType>>(deviceId, name);
    if (nodeType == InputValue<ElemType>::SparseTypeName())    return New<InputValue<ElemType>>(deviceId, name, true);
    if (nodeType == OperationNameOf(InputValue))               return New<InputValue<ElemType>>(deviceId, name);
    if (nodeType == OperationNameOf(LearnableParameter))       return New<LearnableParameter<ElemType>>(deviceId, name);
    if (nodeType == OperationNameOf(MaxPoolingNode))           return New<MaxPoolingNode<ElemType>>(deviceId, name);
    if (nodeType == OperationNameOf(SparseLearnableParameter)) return New<SparseLearnableParameter<ElemType>>(deviceId, name);
    return nullptr; // unknown type name
}
// -----------------------------------------------------------------------
// node creation
// -----------------------------------------------------------------------
// The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs).
// There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others.
// TODO: Do we really need these? Folks who want to use C++ can instead say net->AddNodeToNet(New<>(...)), which is not that different.
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?
// The Create* factories below construct one node, add it to the network, and return it.
// They do NOT attach any inputs; wiring happens separately (or the node type has none).
// create a LearnableParameter node (trainable weight matrix) of the given dimensions
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols)
{
// TODO: in SimpleNetworkBuilder, this is very often followed by InitLearnableParameter()--we should have an overload that just does it right away
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceID(), paramName, rows, cols));
}
// create a SparseLearnableParameter node; 'size' presumably pre-sizes the sparse storage -- TODO confirm against SparseLearnableParameter ctor
//sparse matrix size is optionally specified
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size)
{
return net.AddNodeToNetWithElemType(New<SparseLearnableParameter<ElemType>>(net.GetDeviceID(), paramName, rows, cols, size));
}
// create a dense InputValue node (network input) of rows x cols
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, rows, cols));
}
// create a sparse InputValue node (trailing 'true' selects the sparse variant)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, rows, cols, true));
}
// create a dense InputValue node with image geometry (width x height x channels, numImages samples)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName,
const size_t imageWidth,
const size_t imageHeight,
const size_t imageChannels,
const size_t numImages)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, imageWidth, imageHeight, imageChannels, numImages));
}
// sparse variant of the image-geometry input factory above (trailing 'true' selects sparse)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName,
const size_t imageWidth,
const size_t imageHeight,
const size_t imageChannels,
const size_t numImages)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceID(), inputName, imageWidth, imageHeight, imageChannels, numImages, true));
}
// create a PairNetworkNode of the given dimensions (pairs with a node in another network; see PairNetwork() below)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols)
{
return net.AddNodeToNetWithElemType(New<PairNetworkNode<ElemType>>(net.GetDeviceID(), inputName, rows, cols));
}
// create a ConvolutionNode carrying its kernel/subsampling geometry; weight and input get attached later
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring & nodeName,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceID(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples));
}
// create a MaxPoolingNode carrying its pooling-window geometry; input gets attached later
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring & nodeName,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample)
{
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
}
// create an AveragePoolingNode carrying its pooling-window geometry; input gets attached later
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth,
const size_t windowHeight, const size_t horizontalSubsample,
const size_t verticalSubsample)
{
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
}
// this is the catch-all for all cases not covered as special cases above
// Unlike the specialized ones above, this one creates nodes by type given as a string.
// Note: NewStandardNode() may return nullptr for an unknown type name; that pointer is passed on to AddNodeToNetWithElemType() unchanged.
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateComputationNode(const std::wstring & nodeType, const std::wstring & nodeName)
{
    auto freshNode = NewStandardNode(nodeType, net.GetDeviceID(), nodeName);
    return net.AddNodeToNetWithElemType(freshNode);
}
// -----------------------------------------------------------------------
// node creation
// -----------------------------------------------------------------------
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
// PairNetwork -- wrap node 'a' (which lives in another network) in a PairNetworkNode of this network.
// Fails with an error if this network already contains a node with the same name as 'a'.
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PairNetwork(const ComputationNodePtr & a, const std::wstring nodeName)
{
    const auto & pairedName = a->NodeName();
    // a name clash would make the pairing ambiguous, so refuse it outright
    if (net.GetNodeFromName(pairedName, nullptr, false) != nullptr)
    {
        fprintf(stderr, "PairNetwork: asked to pair a node with name %ls in another network. However, this network has already a node with the same name. Should avoid this case.\n", pairedName.c_str());
        RuntimeError("PairNetwork: asked to pair a node with name in another network. However, this network has already a node with the same name. Should avoid this case.\n");
    }
    auto pairNode = New<PairNetworkNode<ElemType>>(net.GetDeviceID(), nodeName);
    return net.AddNodeToNetAndAttachInputs(pairNode, a);
}
// Convolution -- create a ConvolutionNode with the given geometry and attach (weight, inputValues) as its two inputs
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding,
const std::wstring nodeName,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceID(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples),
weight, inputValues);
}
// MaxPooling -- create a MaxPoolingNode with the given window geometry and attach inputValues as its input
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MaxPoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
inputValues);
}
// AveragePooling -- create an AveragePoolingNode with the given window geometry and attach inputValues as its input
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<AveragePoolingNode<ElemType>>(net.GetDeviceID(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
inputValues);
}
// The wrappers below each create one node of the corresponding type, add it to the network,
// and attach the given node(s) as inputs. The node-type semantics live in the node classes themselves.
// ErrorPrediction -- create an ErrorPredictionNode over inputs (a, b)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ErrorPredictionNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
// PerDimMeanVarNormalization -- inputs are (feature, mean, InvStdDev)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean,
const ComputationNodePtr InvStdDev, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PerDimMeanVarNormalizationNode<ElemType>>(net.GetDeviceID(), nodeName), feature, mean, InvStdDev);
}
// PerDimMeanVarDeNormalization -- inverse counterpart; same input order (feature, mean, InvStdDev)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PerDimMeanVarDeNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean,
const ComputationNodePtr InvStdDev, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PerDimMeanVarDeNormalizationNode<ElemType>>(net.GetDeviceID(), nodeName), feature, mean, InvStdDev);
}
// SquareError -- create a SquareErrorNode over inputs (a, b)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::SquareError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SquareErrorNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
// SequenceDecoder -- inputs are (label, prediction, pairscore)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::SequenceDecoder(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr pairscore, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SequenceDecoderNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction, pairscore);
}
// CrossEntropyWithSoftmax -- inputs are (label, prediction)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CrossEntropyWithSoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction);
}
// NoiseContrastiveEstimation -- inputs are (label, prediction, input_weight, input_bias); 'mode' is forwarded to the node constructor
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction,
const ComputationNodePtr input_weight,
const ComputationNodePtr input_bias, const std::wstring nodeName,
NCEEvalMode mode)
{
return net.AddNodeToNetAndAttachInputs(New<NoiseContrastiveEstimationNode<ElemType>>(net.GetDeviceID(), nodeName, mode), label, prediction, input_weight, input_bias);
}
// ClassCrossEntropyWithSoftmax -- inputs are (label, prediction, input_weight, cls_log_post_prob)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ClassCrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction,
const ComputationNodePtr input_weight,
const ComputationNodePtr cls_log_post_prob,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction, input_weight, cls_log_post_prob);
}
// CRF -- inputs are (label, postDepScore, transition_score)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CRF(const ComputationNodePtr label,
const ComputationNodePtr postDepScore,
const ComputationNodePtr transition_score,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CRFNode<ElemType>>(net.GetDeviceID(), nodeName), label, postDepScore, transition_score);
}
// DummyCriterion -- inputs are (objectives, derivatives, prediction)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DummyCriterionNode<ElemType>>(net.GetDeviceID(), nodeName), objectives, derivatives, prediction);
}
// LSTM -- inputs are (obs, inputGate, forgetGate, outputGate, memoryCellWgt)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LSTM(const ComputationNodePtr obs,
const ComputationNodePtr inputGate,
const ComputationNodePtr forgetGate,
const ComputationNodePtr outputGate,
const ComputationNodePtr memoryCellWgt,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LSTMNode<ElemType>>(net.GetDeviceID(), nodeName), obs, inputGate, forgetGate, outputGate, memoryCellWgt);
}
// CrossEntropy -- inputs are (label, prediction)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CrossEntropyNode<ElemType>>(net.GetDeviceID(), nodeName), label, prediction);
}
// MatrixL1Reg -- create a MatrixL1RegNode over input a
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MatrixL1Reg(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MatrixL1RegNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// MatrixL2Reg -- create a MatrixL2RegNode over input a
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MatrixL2Reg(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MatrixL2RegNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Elementwise/matrix operation factories: each creates one node of the corresponding type and attaches the given input(s).
// Note the few cases where the builder-method name and the node class name differ (Cos -> CosineNode, Sum -> SumElementsNode).
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Mean(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MeanNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::InvStdDev(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<InvStdDevNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Negate(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<NegateNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RectifiedLinear(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RectifiedLinearNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Sigmoid(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SigmoidNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Tanh(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TanhNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Exp(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ExpNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Log(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LogNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Cos maps to CosineNode (not a "CosNode")
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Cos(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CosineNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Softmax(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LogSoftmaxNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Sum maps to SumElementsNode
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Sum(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<SumElementsNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Scale -- input order is (scalar, matrix)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ScaleNode<ElemType>>(net.GetDeviceID(), nodeName), scalar, matrix);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Transpose(const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TransposeNode<ElemType>>(net.GetDeviceID(), nodeName), matrix);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Times(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::TransposeTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TransposeTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ElementTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowElementTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ColumnElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ColumnElementTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
// StrideTimes -- the only three-input Times variant; inputs are (a, b, c)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::StrideTimes(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<StrideTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b, c);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DiagTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DiagTimesNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<CosDistanceNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::KhatriRaoProduct(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<KhatriRaoProductNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PlusNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Minus(const ComputationNodePtr a,
const ComputationNodePtr b,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MinusNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Dropout(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DropoutNode<ElemType>>(net.GetDeviceID(), nodeName), a);
}
// Reshape -- target row count and image geometry are baked into the node at construction time
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Reshape(const ComputationNodePtr a,
const size_t num_rows,
const size_t img_width,
const size_t img_height,
const size_t img_channels,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceID(), nodeName, num_rows, img_width, img_height, img_channels), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowRepeatNode<ElemType>>(net.GetDeviceID(), nodeName, num_repeat), a);
}
// PastValue/FutureValue -- delay nodes; initHiddenActivity and the dimensions are constructor arguments, not inputs
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceID(), nodeName, initHiddenActivity, row_size, col_size), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceID(), nodeName, initHiddenActivity, row_size, col_size), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ParallelNode<ElemType>>(net.GetDeviceID(), nodeName), a, b);
}
// RowSlice -- selects num_rows rows of 'a' starting at start_index
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowSlice(const ComputationNodePtr a, const size_t start_index, const size_t num_rows, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowSliceNode<ElemType>>(net.GetDeviceID(), nodeName, start_index, num_rows), a);
}
// RowStack -- create a RowStackNode and attach an arbitrary number of inputs to be stacked row-wise
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowStack(const std::vector<ComputationNodePtr> pinputs, const std::wstring nodeName)
{
    // AddNodeToNetAndAttachInputs() wants base-class pointers; the range constructor performs the upcast per element
    vector<ComputationNodeBasePtr> baseInputs(pinputs.begin(), pinputs.end());
    return net.AddNodeToNetAndAttachInputs(New<RowStackNode<ElemType>>(net.GetDeviceID(), nodeName), baseInputs);
}
// GMMLogLikelihood -- inputs are (unnormedPrior, mean, logStddev, feature)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::GMMLogLikelihood(const ComputationNodePtr unnormedPrior,
const ComputationNodePtr mean,
const ComputationNodePtr logStddev,
const ComputationNodePtr feature,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<GMMLogLikelihoodNode<ElemType>>(net.GetDeviceID(), nodeName), unnormedPrior, mean, logStddev, feature);
}
// TimeReverse -- create a TimeReverseNode over 'input'
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::TimeReverse(const ComputationNodePtr input, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<TimeReverseNode<ElemType>>(net.GetDeviceID(), nodeName), input);
}
// LookupTable -- inputs are (dictionary, input)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LookupTable(const ComputationNodePtr dictionary, const ComputationNodePtr input, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LookupTableNode<ElemType>>(net.GetDeviceID(), nodeName), dictionary, input);
}
// explicit instantiations for the two supported element types
template class ComputationNetworkBuilder<float>;
template class ComputationNetworkBuilder<double>;
}}}

Просмотреть файл

@ -10,7 +10,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template<typename ElemType>
template<class ElemType>
class ComputationNetworkBuilder
{
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;

Просмотреть файл

@ -14,14 +14,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// code
// TODO: move more code here to speed up compilation
template<typename ElemType>
template<class ElemType>
/*virtual*/ void ComputationNode<ElemType>::MoveMatricesToDevice(const DEVICEID_TYPE deviceId)
{
m_functionValues.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true, m_functionValues.HasNoElements());
m_gradientValues.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true, m_gradientValues.HasNoElements());
}
template<typename ElemType>
template<class ElemType>
/*virtual*/ void ComputationNode<ElemType>::DumpNodeInfo(const bool /*printValues*/, File& fstream) const
{
fstream << L"\n" + NodeName() + L"=" + OperationName();

Просмотреть файл

@ -90,6 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing.
virtual const std::wstring OperationName() const = 0;
#define OperationNameOf(T) (T<float>::TypeName()) // we are templated, but for this the type param matters not. So we just pick one, and hide that fact.
// TODO: make sure this does not get implemented in any of the base classes
DEVICEID_TYPE GetDeviceId() const { return m_deviceId; } // TODO: remove, only used from copy constructor which will go away
@ -1255,7 +1256,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// This will provide default implementations for those two functions that will fail at runtime with a meaningful error.
template<typename ElemType>
template<class ElemType>
class ComputationNodeNonLooping : public ComputationNode<ElemType>
{
public:

Просмотреть файл

@ -232,7 +232,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("ConvolutionNode requires two inputs.");
//we may want to remove this check in the future if we want to support the case that the weight itself is result of some computation
//if (Inputs(0)->OperationName() != LearnableParameter<ElemType>::TypeName())
//if (Inputs(0)->OperationName() != OperationNameOf(LearnableParameter))
// throw std::logic_error("ConvolutionNode requires the first input to be LearnableParameter type.");
if (m_horizontalSubsample > m_kernelWidth || m_verticalSubsample > m_kernelHeight)
@ -242,7 +242,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t weightCols = m_kernelWidth * m_kernelHeight * m_inputChannels;
if (Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && Inputs(0)->FunctionValues().HasNoElements())
if (Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && Inputs(0)->FunctionValues().HasNoElements())
{
Inputs(0)->FunctionValues().Resize(m_outputChannels, weightCols);
}
@ -255,7 +255,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
size_t inputDim = m_inputWidth * m_inputHeight * m_inputChannels;
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && Inputs(1)->FunctionValues().GetNumRows() == 0)
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && Inputs(1)->FunctionValues().GetNumRows() == 0)
{
Inputs(1)->FunctionValues().Resize(inputDim, Inputs(1)->FunctionValues().GetNumCols());
}
@ -601,7 +601,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_inputSizePerSample = m_inputWidth * m_inputHeight * m_inputChannels;
m_outputSizePerSample = m_outputWidth * m_outputHeight * m_outputChannels;
if (Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && Inputs(0)->FunctionValues().GetNumRows() == 0)
if (Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && Inputs(0)->FunctionValues().GetNumRows() == 0)
{
Inputs(0)->FunctionValues().Resize(m_inputSizePerSample, Inputs(0)->FunctionValues().GetNumCols());
}
@ -813,7 +813,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_inputSizePerSample = m_inputWidth * m_inputHeight * m_inputChannels;
m_outputSizePerSample = m_outputWidth * m_outputHeight * m_outputChannels;
if (Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && Inputs(0)->FunctionValues().GetNumRows() == 0)
if (Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && Inputs(0)->FunctionValues().GetNumRows() == 0)
{
Inputs(0)->FunctionValues().Resize(m_inputSizePerSample, Inputs(0)->FunctionValues().GetNumCols());
}

Просмотреть файл

@ -67,7 +67,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("ErrorPrediction operation requires two inputs.");
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
// TODO: use dynamic_pointer_cast instead
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -75,7 +76,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();

Просмотреть файл

@ -20,6 +20,7 @@
#include "Basics.h"
#include "Matrix.h"
#include "File.h" // for LoadMatrixFromTextFile()
#include "ComputationNode.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -77,7 +78,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_outputChannels = 1;
}
// TODO: also move file loading here?
// initialize with random numbers
void InitRandom(const bool uniformInit,
const unsigned long randomSeed,
const ElemType initValueScale,
@ -102,6 +103,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_functionValues.TransferToDeviceIfNotThereAndNotAutoPlace(m_deviceId, true);
}
// initialize by reading a matrix from a text file
void InitFromFile(const std::wstring & initFromFilePath)
{
size_t numRows = 0;
size_t numCols = 0;
auto array = File::LoadMatrixFromTextFile<ElemType>(msra::strfun::utf8(initFromFilePath), numRows, numCols); // TODO: change pathname to wstring
FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, m_deviceId);
}
virtual const std::wstring OperationName() const {return TypeName();}
virtual void ComputeInputPartial(const size_t /*inputIndex*/) {}

Просмотреть файл

@ -808,10 +808,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((rows0 == 0 || cols1 == 0 ) && this->LoopId() < 0)
throw logic_error("Times operation: Inputs(0)->FunctionValues().GetNumRows() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred");
if ((Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 == 0 && rows1 != 0) && this->LoopId() < 0)
// TODO: use dynamic_pointer_cast
// TODO: why should these nodes even care whether their inputs are LearnableParmaeters? If needed, can the base class do this?
if ((Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && rows1 != 0) && this->LoopId() < 0)
Inputs(0)->FunctionValues().Resize(rows0, rows1);
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 != 0 && rows1 == 0)
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && cols0 != 0 && rows1 == 0)
Inputs(1)->FunctionValues().Resize(cols0, cols1);
if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements())&& this->LoopId() < 0)
@ -970,10 +972,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((rows0 == 0 || cols1 == 0) && this->LoopId() < 0)
throw logic_error("TransposeTimes operation: Inputs(0)->FunctionValues().GetNumRows() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred");
if ((Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 == 0 && rows1 != 0) && this->LoopId() < 0)
if ((Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && rows1 != 0) && this->LoopId() < 0)
Inputs(0)->FunctionValues().Resize(rows0, rows1);
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 != 0 && rows1 == 0)
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && cols0 != 0 && rows1 == 0)
Inputs(1)->FunctionValues().Resize(cols0, cols1);
if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements()) && this->LoopId() < 0)
@ -1089,7 +1091,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//derive number of rows if possible
for (size_t index = 0; index < 2; index++)
{
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0 ? Inputs(1 - index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0 ? Inputs(1 - index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -1384,7 +1386,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//derive number of rows if possible
for (size_t index = 0; index < 2; index++)
{
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0 ? Inputs(1 - index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0 ? Inputs(1 - index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -1615,7 +1617,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//if dimention not specified we assume two operants' dimentions should be the same
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -1623,7 +1625,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -1899,7 +1901,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//if dimention is missing make the two operatants to have same size
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -1907,7 +1909,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -2046,12 +2048,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("DiagTimes operation requires two inputs.");
//if dimention not specified we assume two operants' dimentions should match
if (Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && Inputs(0)->FunctionValues().GetNumRows() == 0 && Inputs(1)->FunctionValues().GetNumRows() != 0)
if (Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && Inputs(0)->FunctionValues().GetNumRows() == 0 && Inputs(1)->FunctionValues().GetNumRows() != 0)
{
Inputs(0)->FunctionValues().Resize(Inputs(1)->FunctionValues().GetNumRows(), 1);
}
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && Inputs(0)->FunctionValues().GetNumRows() != 0 && Inputs(1)->FunctionValues().GetNumRows() == 0)
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && Inputs(0)->FunctionValues().GetNumRows() != 0 && Inputs(1)->FunctionValues().GetNumRows() == 0)
{
Inputs(1)->FunctionValues().Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols());
}
@ -2249,7 +2251,7 @@ private:
//if dimention is missing make the two operatants to have same size
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -2257,7 +2259,7 @@ private:
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -2426,10 +2428,10 @@ private:
if (rows0 == 0 || rows1 == 0)
throw logic_error("KhatriRaoProduct operation: The number of rows in the input should not be 0.");
if (Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 == 0 && cols1 != 0)
if (Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && cols1 != 0)
Inputs(0)->FunctionValues().Resize(rows0, cols1);
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 != 0 && cols1 == 0)
if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && cols0 != 0 && cols1 == 0)
Inputs(1)->FunctionValues().Resize(rows1, cols0);
//cols may be changed before this line and so cannot use cached cols values below
@ -2655,7 +2657,7 @@ private:
//if dimention is missing make the two operatants to have same size
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0 ? Inputs(1 - index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0 ? Inputs(1 - index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -2663,7 +2665,7 @@ private:
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0 ? Inputs(1 - index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0 ? Inputs(1 - index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();

Просмотреть файл

@ -25,7 +25,7 @@ namespace Microsoft {
void GetReleasedMatrices(vector<shared_ptr<Matrix<float>>> * releasedMatrices) { releasedMatrices = &m_releasedFloatMatrices; }
void GetReleasedMatrices(vector<shared_ptr<Matrix<double>>> * releasedMatrices) { releasedMatrices = &m_releasedDoubleMatrices; }
public:
template<typename ElemType>
template<class ElemType>
void Release(const shared_ptr<Matrix<ElemType>> & freeMatrix)
{
vector<shared_ptr<Matrix<float>>> * releasedMatrices;
@ -35,7 +35,7 @@ namespace Microsoft {
releasedMatrices->push_back(freeMatrix);
}
template<typename ElemType>
template<class ElemType>
shared_ptr<Matrix<ElemType>> Request(DEVICEID_TYPE deviceId = AUTOPLACEMATRIX)
{
vector<shared_ptr<Matrix<float>>> * releasedMatrices;

Просмотреть файл

@ -0,0 +1,767 @@
// NetworkBuilderFromConfig.cpp -- interface to node and network creation from glue languages through config record parameters --fseide
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "Basics.h"
#include "BrainScriptEvaluator.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "RecurrentNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "ConvolutionalNodes.h"
#include "ComputationNetwork.h"
#include "ComputationNetworkBuilder.h"
#include <memory>
#include <deque>
#include <set>
#include <string>
#ifndef let
#define let const auto
#endif
namespace Microsoft { namespace MSR { namespace BS {
using namespace Microsoft::MSR;
// The following class(es) implement the MakeRuntimeObject() function for different types. Sorry for the strange template dance.
// -------------------------------------------------------------------
// basic function template, for classes that can instantiate themselves from IConfigRecordPtr TODO: do we even have any?
// -------------------------------------------------------------------
template<typename ElemType, class C>
struct DualPrecisionHelpers
{
static shared_ptr<Object> MakeRuntimeObject(const IConfigRecordPtr config) { return make_shared<C>(config); }
};
// -------------------------------------------------------------------
// ComputationNode -- covers all standard nodes
// -------------------------------------------------------------------
// helper wrapper class for ComputationNodes that must AttachInputs() late due to circular references
// Instantiate with LateAttachingNode<node type>(lambda, args for node constructor).
// To resolve, call AttachInputs()
// TODO: This is a bit indirect. Can it be done more nicely?
struct ILateAttachingNode { virtual void LateAttachInputs() = 0; };
template<class N>
class LateAttachingNode : public N, public ILateAttachingNode
{
typedef typename N::OurElemType ElemType;
function<void(ComputationNode<ElemType>*)> attachInputs;
public:
// constructor
template<class... _Types>
LateAttachingNode(DEVICEID_TYPE deviceId, const wstring & name, const function<void(ComputationNode<ElemType>*)> & attachInputs, _Types&&... _Args) : attachInputs(attachInputs), N(deviceId, name, forward<_Types>(_Args)...) {}
// the one member that does the work
void /*ILateAttachingNode::*/LateAttachInputs()
{
attachInputs(dynamic_cast<N*>(this));
attachInputs = [](ComputationNode<ElemType>*){ LogicError("LateAttachingNode::AttachInputs: must only be called once"); };
}
};
template<class ElemType>
struct DualPrecisionHelpers<ElemType, ComputationNode<ElemType>>
{
// create ComputationNode
// This is the equivalent of the old SynchronousNodeEvaluator::Evaluate(), and we duplicate code from there.
static shared_ptr<Object> MakeRuntimeObject(const IConfigRecordPtr configp)
{
let & config = *configp;
wstring operationName = config[L"operation"];
wstring nodeName = L"<placeholder>"; // name will be overwritten by caller upon return (TODO: fix this here? pass expression name in?)
DEVICEID_TYPE deviceId = (DEVICEID_TYPE)(int)config[L"deviceId"];
static unsigned long m_randomSeedOffset = 0; // TODO: this is held in the ComputationNetwork, but we don't have one yet
// TODO" ^^ actually it seems only used by initialization of LearnableParameters--check that again; in that case, we can have a local
// note on optional parameters
// Instead of defining optional parameters here in code, they are defined as optional args to the creating macro.
ComputationNodeBasePtr node;
#define OpIs(op) (operationName == msra::strfun::utf16(OperationNameOf(op)))
// TODO: in the code below, for reference, each block is preceded by an #if-0'ed out copy of the respective code from SynchronousNodeEvaluator::Evaluate()--remove these when this all works
// first group: nodes without inputs
#if 0
if (OperationNameOf(InputValue) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// first look for this node already existing in the network
if (m_net.NodeNameExist(name))
nodePtr = m_net.GetNodeFromName(name);
else
nodePtr = m_net.CreateInputNode(name, rows, cols);
}
}
else if (InputValue<ElemType>::SparseTypeName() == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// first look for this node already existing in the network
if (m_net.NodeNameExist(name))
nodePtr = m_net.GetNodeFromName(name);
else
nodePtr = m_net.CreateSparseInputNode(name, rows, cols);
}
}
else if (cnNodeType == L"ImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
nodePtr = m_net.CreateInputNode(name, imageWidth, imageHeight, imageChannels, numImages);
}
}
else if (cnNodeType == L"SparseImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
nodePtr = m_net.CreateSparseInputNode(name, imageWidth, imageHeight, imageChannels, numImages);
}
}
#endif
if (OpIs(InputValue))
{
let isSparse = config(L"isSparse");
let isImage = config(L"isImage");
if (!isImage)
node = New<InputValue<ElemType>>(deviceId, nodeName, (size_t)config[L"rows"], (size_t)config[L"cols"], isSparse);
else
node = New<InputValue<ElemType>>(deviceId, nodeName, (size_t)config[L"imageWidth"], (size_t)config[L"imageHeight"], (size_t)config[L"imageChannels"], (size_t)config[L"numImages"], isSparse);
}
#if 0
else if (OperationNameOf(LearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "true");
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = needGradient;
}
else if (pass == ndlPassFinal)
{
static int randomSeed = 1;
std::string initString = node->GetOptionalParameter("init", "uniform");
ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1");
ElemType value = node->GetOptionalParameter("value", "0");
msra::strfun::tolower_ascii(initString);
if (initString == "fixedvalue")
nodePtr->FunctionValues().SetValue(value);
else if (initString == "uniform")
m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale);
else if (initString == "gaussian")
m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale);
else if (initString == "fromfile")
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
if (initFromFilePath == "")
RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method");
if (initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size() - 1] == '\"')
// remove the opening and closing double quotes
initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size() - 2);
if (!fexists(initFromFilePath))
RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str());
m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath);
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]");
}
}
else if (OperationNameOf(SparseLearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "true");
nodePtr = m_net.CreateSparseLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = needGradient;
}
else if (pass == ndlPassFinal)
{
static int randomSeed = 1;
std::string initString = node->GetOptionalParameter("init", "uniform");
ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1");
ElemType value = node->GetOptionalParameter("value", "0");
msra::strfun::tolower_ascii(initString);
if (initString == "fixedvalue")
nodePtr->FunctionValues().SetValue(value);
else if (initString == "uniform")
m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale);
else if (initString == "gaussian")
m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale);
else if (initString == "fromfile")
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
if (initFromFilePath == "")
RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method");
if (initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size() - 1] == '\"')
// remove the opening and closing double quotes
initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size() - 2);
if (!fexists(initFromFilePath))
RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str());
m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath);
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]");
}
}
#endif
else if (OpIs(LearnableParameter) || OpIs(SparseLearnableParameter))
{
// parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
// TODO: do we need a default value mechanism? How to make sure it does not pop upwards? Current functions do not allow overloads.
// TODO: test this with random init for QuickE2E on CPU against SimpleNetworkBuilder
let isSparse = (operationName.find(L"Sparse") != wstring::npos);
if (!isSparse)
node = New<LearnableParameter<ElemType>>(deviceId, nodeName, (size_t)config[L"rows"], (size_t)config[L"cols"]);
else
node = New<SparseLearnableParameter<ElemType>>(deviceId, nodeName, (size_t)config[L"rows"], (size_t)config[L"cols"], 0/*size*/); // TODO: what is size?
node->NeedGradient() = config[L"needGradient"];
static int randomSeed = 1;
wstring initString = config[L"init"];
if (initString == L"fixedValue")
dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->FunctionValues().SetValue((ElemType)config[L"value"]);
else if (initString == L"uniform" || initString == L"gaussian")
{
// TODO: add these options also to old NDL
int forcedRandomSeed = config[L"randomSeed"]; // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->InitRandom((initString == L"uniform"), forcedRandomSeed < 0 ? (randomSeed++ + m_randomSeedOffset) : (unsigned long)forcedRandomSeed, config[L"initValueScale"], config[L"initOnCPUOnly"]);
}
else if (initString == L"fromFile")
{
wstring initFromFilePath = config[L"initFromFilePath"];
if (initFromFilePath.empty())
RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method");
dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->InitFromFile(initFromFilePath);
}
else
RuntimeError("init must be one of the values of [uniform|gaussian|fixedValue|fromFile]");
}
#if 0
else if (cnNodeType == L"Constant")
{
if (parameter.size() != 1)
RuntimeError("Constant should have 1 fixed parameter [val] and two optional parameters [rows=[1|yourvalue], cols=[1|yourvalue]].");
if (pass == ndlPassInitial)
{
size_t rows = node->GetOptionalParameter("rows", "1");
size_t cols = node->GetOptionalParameter("cols", "1");
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = false;
}
else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0)
{
double val = parameter[0]->GetScalar();
nodePtr->FunctionValues().SetValue(val);
}
}
#endif
// Constant is implemented as a LearnableParameter with initializion as fixedValue with needGradient false, on script level
#if 0
else if (cnNodeType == OperationNameOf(PastValueNode) ||
cnNodeType == OperationNameOf(FutureValueNode))
{
if (parameter.size() <2 || parameter.size() >3)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, [cols], m, [timeStep=1, defaultPastValue=0.1]).");
nodeParamCount = 1;
nodeParamStart = parameter.size() > 2 ? 2 : 1;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
// if we have three parameters the second is columns
size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "false");
float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1");
//for backward compatibility we check timeStep first
size_t timeStep = node->GetOptionalParameter("timeStep", "1");
if (timeStep == 1)
{
timeStep = node->GetOptionalParameter("delayTime", "1");
}
if (cnNodeType == OperationNameOf(PastValueNode))
{
nodePtr = m_net.PastValue(NULL, defaultHiddenActivity, rows, cols, name);
static_pointer_cast<PastValueNode<ElemType>>(nodePtr)->SetTimeStep(timeStep);
}
else
{
nodePtr = m_net.FutureValue(NULL, defaultHiddenActivity, rows, cols, name);
static_pointer_cast<FutureValueNode<ElemType>>(nodePtr)->SetTimeStep(timeStep);
}
nodePtr->NeedGradient() = needGradient; // TODO: What for?
}
}
#endif
// nodes with delayed inputs, where we cannot yet resolve inputs due to circular references
else if (OpIs(PastValueNode) || OpIs(FutureValueNode)) // TODO: untested
{
// rows, cols, input, [timeStep=1, defaultHiddenActivation=0.1]
// Note: changed names of optional args compared to current NDL
// TODO: we really should NOT have to specify the dimensions; network builder can figure it out. Keep it for now, fix when it is time.
// We instantiate not the node directly, but a wrapped version that can cast to LateAttachingNode, which holds a lambda to complete the attachment process at the appropriate time.
function<void(ComputationNode<ElemType>*)> completeAttachInputs = [configp](ComputationNode<ElemType>* node) // This is the lambda to complete the process. Note that config captured as a shared_ptr.
{
node->AttachInputs(GetInputs(*configp)); // this is executed by network builder while iterating the nodes
};
if (OpIs(PastValueNode))
node = New<LateAttachingNode<PastValueNode<ElemType>>>(deviceId, nodeName, completeAttachInputs, (ElemType)config[L"defaultHiddenActivation"], (size_t)config[L"rows"], (size_t)config[L"cols"], (size_t)config[L"timeStep"]);
else
node = New<LateAttachingNode<FutureValueNode<ElemType>>>(deviceId, nodeName, completeAttachInputs, (ElemType)config[L"defaultHiddenActivation"], (size_t)config[L"rows"], (size_t)config[L"cols"], (size_t)config[L"timeStep"]);
}
else // nodes with inputs
{
let inputs = GetInputs(config);
// second group: nodes with special initializers
#if 0
/*else*/ if (cnNodeType == OperationNameOf(RowSliceNode))
{
if (parameter.size() != 3)
RuntimeError("RowSlice should have three parameters. Usage: RowSlice(startRowIndex, numRows, origNodeName.");
nodeParamCount = 1;
nodeParamStart = 2;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t start_index = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t num_rows = ((NDLNode<ElemType>*)params[1])->GetScalar();
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = m_net.RowSlice(NULL, start_index, num_rows, name);
nodePtr->NeedGradient() = needGradient;
}
}
#endif
if (OpIs(RowSliceNode)) // TODO: untested
{
// startIndex, numRows, inputs /*one*/, needGradient=false
node = New<RowSliceNode<ElemType>>(deviceId, nodeName, (size_t)config[L"startIndex"], (size_t)config[L"numRows"]);
node->NeedGradient() = config[L"needGradient"];
}
#if 0
else if (cnNodeType == OperationNameOf(RowRepeatNode))
{
if (parameter.size() != 2)
RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats.");
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t num_repeat = ((NDLNode<ElemType>*)params[1])->GetScalar();
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = m_net.RowRepeat(NULL, num_repeat, name);
nodePtr->NeedGradient() = needGradient;
}
}
#endif
else if (OpIs(RowRepeatNode)) // TODO: untested
{
// inputs /*one*/, numRepeats, needGradient=false
node = New<RowRepeatNode<ElemType>>(deviceId, nodeName, (size_t)config[L"numRepeats"]);
node->NeedGradient() = config[L"needGradient"];
}
#if 0
else if (cnNodeType == OperationNameOf(ReshapeNode))
{
if (parameter.size() < 2 || parameter.size() > 5)
RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=].");
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t num_rows = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t img_width = node->GetOptionalParameter("imageWidth", "0");
size_t img_height = node->GetOptionalParameter("imageHeight", "0");
size_t img_channels = node->GetOptionalParameter("imageChannels", "0");
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = m_net.Reshape(NULL, num_rows, img_width, img_height, img_channels, name);
nodePtr->NeedGradient() = needGradient;
}
}
#endif
else if (OpIs(ReshapeNode)) // TODO: untested
{
// inputs /*one*/, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0
node = New<ReshapeNode<ElemType>>(deviceId, nodeName, (size_t)config[L"numRows"], (size_t)config[L"imageWidth"], (size_t)config[L"imageHeight"], (size_t)config[L"imageChannels"]);
node->NeedGradient() = config[L"needGradient"];
//nodePtr = m_net.Reshape(NULL, num_rows, img_width, img_height, img_channels, name);
// BUGBUG: ^^ how to implement this?? We got no network here. What is this for?
LogicError("ReshapeNode not working with BS because init code needs access to network which we don't haveyet--to be fixed elsewhere");
}
#if 0
else if (cnNodeType == OperationNameOf(ConvolutionNode))
{
if (parameter.size() != 7)
RuntimeError("%ls should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 2;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
int id = 2; // skip weightNode and inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t kernelWidth = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t kernelHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t outputChannels = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 5);
//optional
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
nodePtr = m_net.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples);
}
}
#endif
else if (OpIs(ConvolutionNode)) // TODO: untested
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0
node = New<ConvolutionNode<ElemType>>(deviceId, nodeName, (size_t)config[L"kernelWidth"], (size_t)config[L"kernelHeight"], (size_t)config[L"outputChannels"],
(size_t)config[L"horizontalSubsample"], (size_t)config[L"verticalSubsample"],
(bool)config[L"zeroPadding"], (size_t)config[L"maxTempMemSizeInSamples"]);
}
#if 0
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
int id = 1; // skip inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t windowWidth = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);
nodePtr = m_net.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
}
}
#endif
else if (OpIs(MaxPoolingNode)) // TODO: untested
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
node = New<MaxPoolingNode<ElemType>>(deviceId, nodeName, (size_t)config[L"windowWidth"], (size_t)config[L"windowHeight"], (size_t)config[L"horizontalSubsample"], (size_t)config[L"verticalSubsample"]);
}
#if 0
else if (cnNodeType == OperationNameOf(AveragePoolingNode))
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
int id = 1; // skip inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t windowWidth = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);
nodePtr = m_net.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
}
}
#endif
else if (OpIs(AveragePoolingNode)) // TODO: untested
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
node = New<AveragePoolingNode<ElemType>>(deviceId, nodeName, (size_t)config[L"windowWidth"], (size_t)config[L"windowHeight"], (size_t)config[L"horizontalSubsample"], (size_t)config[L"verticalSubsample"]);
}
// last group: standard nodes that only take 'inputs'
else
{
node = ComputationNetworkBuilder<ElemType>::NewStandardNode(operationName, deviceId, nodeName);
}
node->AttachInputs(inputs); // TODO: where to check the number of inputs? Should be a template parameter to ComputationNode!
}
// add a tag
let nodeWithTag = dynamic_pointer_cast<WithTag>(node);
if (nodeWithTag)
nodeWithTag->SetTag(config[L"tag"]);
// and done
return node;
}
private:
// helper for the factory function for ComputationNodes:
// Collect the ComputationNode inputs listed under the config key L"inputs".
// The "inputs" value may be either a single ComputationNodeBase or a whole
// ConfigArray of them; either way, a flat vector of node pointers is returned.
static vector<ComputationNodeBasePtr> GetInputs(const IConfigRecord & config)
{
    vector<ComputationNodeBasePtr> inputs;
    let inputsArg = config[L"inputs"];
    if (inputsArg.Is<ComputationNodeBase>()) // single arg
        inputs.push_back(inputsArg);
    else // a whole vector
    {
        // NOTE(review): cast assumes the value actually holds a ConfigArray; what happens on a
        // mismatched type is determined by ConfigValuePtr's cast operator -- TODO confirm it fails cleanly
        ConfigArrayPtr inputsArray = (ConfigArrayPtr&)inputsArg;
        let range = inputsArray->GetIndexRange();
        for (int i = range.first; i <= range.second; i++) // pull them. This will resolve all of them.
            inputs.push_back(inputsArray->At(i, inputsArg.GetLocation()));
    }
    return inputs;
}
};
// -------------------------------------------------------------------
// ComputationNetwork
// -------------------------------------------------------------------
// initialize a ComputationNetwork from a ConfigRecord
// Instantiate a ComputationNetwork from a ConfigRecord:
// collects all root ComputationNodes found in the record, then walks their
// children (work-list style) to register every reachable node with the network,
// resolving late-attaching inputs and sorting nodes into tag-based groups.
template<>
/*static*/ shared_ptr<Object> MakeRuntimeObject<ComputationNetwork>(const IConfigRecordPtr configp)
{
    let & config = *configp;
    DEVICEID_TYPE deviceId = (DEVICEID_TYPE)(int)config[L"deviceId"];
    auto net = make_shared<ComputationNetwork>(deviceId);
    auto & m_nameToNodeMap = net->GetNameToNodeMap();
    deque<ComputationNodeBasePtr> workList;
    // flatten the set of all nodes
    // we collect all root ComputationNodes from the config record, and then expand into all their children by work-list processing
    // TODO: This currently only collects nodes of the same ElemType. We could allow conversion operators.
    // TODO: Can we even make the ComputationNetwork independent of ElemType?? As long as the nodes themselves are hooked up properly that should be OK!
    for (let & id : config.GetMemberIds())
    {
        let & value = config[id];
        if (value.Is<ComputationNodeBase>())
            workList.push_back((ComputationNodeBasePtr&)value);
    }
    // process work list
    // Also call FinalizeInit where we must.
    while (!workList.empty())
    {
        let node = workList.front();
        workList.pop_front();
        // add to set; insert() is a no-op if a node with this name is already registered
        let res = m_nameToNodeMap.insert(make_pair(node->NodeName(), node));
        if (!res.second) // not inserted: we already got this one
            if (res.first->second == node)
                continue; // the same node reached twice (shared child): nothing more to do
            else // oops, a different node with the same name
                LogicError("ComputationNetwork: multiple nodes with the same NodeName() '%ls'", node->NodeName().c_str());
        // If node derives from MustFinalizeInit() then it has unresolved inputs. Resolve them now.
        // This may generate a whole new load of nodes, including nodes which in turn have late init.
        // TODO: think this through whether it may generate circular references nevertheless
        let lateAttachingNode = dynamic_pointer_cast<ILateAttachingNode>(node);
        if (lateAttachingNode)
            lateAttachingNode->LateAttachInputs();
        // add it to the respective node group based on the tag
        let nodeWithTag = dynamic_pointer_cast<WithTag>(node);
        if (nodeWithTag)
        {
            wstring tag = nodeWithTag->GetTag();
            if (tag == L"feature") net->FeatureNodes().push_back(node);
            else if (tag == L"label") net->LabelNodes().push_back(node);
            else if (tag == L"criterion" || tag == L"criteria") net->FinalCriterionNodes().push_back(node); // 'criteria' is wrong (plural); we keep it for compat
            else if (!_wcsnicmp(tag.c_str(), L"eval", 4)) net->EvaluationNodes().push_back(node); // any tag with case-insensitive prefix "eval"
            else if (tag == L"output") net->OutputNodes().push_back(node);
            else if (tag == L"pair") net->PairNodes().push_back(node); // TODO: I made this up; the original code in SynchronousExecutionEngine did not have this
            else if (tag == L"multiseq") net->NodesReqMultiSeqHandling().push_back(node);
            else if (!tag.empty())
                RuntimeError("ComputationNetwork: unknown tag '%ls'", tag.c_str());
            // TODO: are there nodes without tag? Where do they go?
        }
        // TODO: ...can we do stuff like propagating dimensions here? Or still too early?
        // traverse children: append them to the end of the work list
        let children = node->GetChildren();
        for (auto child : children)
            workList.push_back(child); // (we could check whether c is in 'nodes' already here to optimize, but this way it is cleaner)
    }
    // TODO: what is missing is the dimensions
#if 1
    // debug output: dump the assembled network to stderr
    wstring args = net->ToString();
    fprintf(stderr, "%ls\n", args.c_str());
#endif
    // these post-processing steps are done by the other network builders, but I don't know why they are necessary
    net->FixupInputMinibatchSize(); // make sure dimensions are set up correctly
    net->ResetEvalTimeStamp(); // (should not really be needed)
    return net;
}
// creates the lambda for creating an object that can exist as 'float' or 'double'
// Pass both types as the two template args (the <float> and <double> instantiations
// of the same class template). The returned descriptor's construct lambda reads the
// config parameter 'precision' at construction time to pick the instantiation.
template<class Cfloat, class Cdouble>
static ConfigurableRuntimeType MakeRuntimeTypeConstructorDualPrecision()
{
    ConfigurableRuntimeType rtInfo;
    rtInfo.construct = [](const IConfigRecordPtr config) // lambda to construct--this lambda can construct both the <float> and the <double> variant based on config parameter 'precision'
    {
        wstring precision = (*config)[L"precision"]; // dispatch on ElemType
        if (precision == L"float")
            return DualPrecisionHelpers<float, Cfloat>::MakeRuntimeObject(config);
        else if (precision == L"double")
            return DualPrecisionHelpers<double, Cdouble>::MakeRuntimeObject(config);
        else
            RuntimeError("invalid value for 'precision', must be 'float' or 'double'");
    };
    rtInfo.isConfigRecord = is_base_of<IConfigRecord, Cfloat>::value;
    static_assert(is_base_of<IConfigRecord, Cfloat>::value == is_base_of<IConfigRecord, Cdouble>::value, ""); // we assume that both float and double have the same behavior
    return rtInfo;
}
// and the regular one without ElemType dependency:
// creates the runtime-type descriptor for a class C that exists in a single,
// precision-independent form (e.g. ComputationNetwork).
template<class C>
static ConfigurableRuntimeType MakeRuntimeTypeConstructor()
{
    ConfigurableRuntimeType rtInfo;
    rtInfo.construct = [](const IConfigRecordPtr config) // lambda to construct a C from a config record; no 'precision' dispatch here, unlike the dual-precision variant
    {
        return MakeRuntimeObject<C>(config);
    };
    rtInfo.isConfigRecord = is_base_of<IConfigRecord, C>::value;
    return rtInfo;
}
#define DefineRuntimeType(T) { L ## #T, MakeRuntimeTypeConstructor<T>() }
#define DefineRuntimeTypeDualPrecision(T) { L ## #T, MakeRuntimeTypeConstructorDualPrecision<T<float>,T<double>>() }
// Look up information about a configurable runtime type by its type name.
// Returns a ConfigurableRuntimeType descriptor, which primarily carries the lambda
// that constructs a runtime object from a ConfigRecord ('new' expression),
// or nullptr if the type name is not registered here.
const ConfigurableRuntimeType * FindExternalRuntimeTypeInfo(const wstring & typeId)
{
    // registry for "new" expressions:
    // every C++ type instantiable from "new" appears here with its constructor lambda and type flags
    static map<wstring, ConfigurableRuntimeType> configurableRuntimeTypes =
    {
        // ComputationNodes
        DefineRuntimeTypeDualPrecision(ComputationNode),
        DefineRuntimeType(ComputationNetwork),
#if 0
        DefineRuntimeType(RecurrentComputationNode),
        // In this experimental state, we only have Node and Network.
        // Once BrainScript becomes the driver of everything, we will add other objects like Readers, Optimizers, and Actions here.
#endif
    };
    // consult our own registry; unknown names yield nullptr so callers can fall back elsewhere
    const auto entry = configurableRuntimeTypes.find(typeId);
    return (entry == configurableRuntimeTypes.end()) ? nullptr : &entry->second;
}
}}}

Просмотреть файл

@ -1125,7 +1125,7 @@ virtual const std::wstring OperationName() const { return TypeName(); }
ComputeInputPartialS(m_dropoutRate, sliceInput0Grad, sliceMask, sliceOutputGrad);
}
static void WINAPI ComputeInputPartialS(const ElemType dropoutRate, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& maskOfDropout, const Matrix<ElemType>& gradientValues)
static void WINAPI ComputeInputPartialS(const double dropoutRate, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& maskOfDropout, const Matrix<ElemType>& gradientValues)
{
if (dropoutRate > 0)
{
@ -1159,13 +1159,13 @@ virtual const std::wstring OperationName() const { return TypeName(); }
EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value);
}
static void WINAPI EvaluateThisNodeS(const ElemType dropoutRate, unsigned long& randomSeed, Matrix<ElemType>& functionValues, Matrix<ElemType>& maskOfDropout, const Matrix<ElemType>& inputFunctionValues)
static void WINAPI EvaluateThisNodeS(const double dropoutRate, unsigned long& randomSeed, Matrix<ElemType>& functionValues, Matrix<ElemType>& maskOfDropout, const Matrix<ElemType>& inputFunctionValues)
{
if (dropoutRate > 0)
{
maskOfDropout.Resize(inputFunctionValues.GetNumRows(), inputFunctionValues.GetNumCols());
maskOfDropout.SetUniformRandomMask(dropoutRate, ElemType(1.0) / (ElemType(1) - dropoutRate), randomSeed);
maskOfDropout.SetUniformRandomMask((ElemType)dropoutRate, (ElemType)(1.0 / (1.0 - dropoutRate)), randomSeed);
randomSeed += 1073807359; //1073807359 is a very large prime number to avoid collision with other dropout nodes
functionValues.AssignElementProductOf(maskOfDropout, inputFunctionValues);
@ -1217,7 +1217,7 @@ virtual const std::wstring OperationName() const { return TypeName(); }
m_children[0] = inputNode;
}
void SetDropoutRate(const ElemType val)
void SetDropoutRate(const double val)
{
if (val < 0 || val >= 1)
throw std::logic_error("DropoutRate must be >= 0 and < 1.");
@ -1249,7 +1249,7 @@ virtual const std::wstring OperationName() const { return TypeName(); }
}
}
private:
ElemType m_dropoutRate;
double m_dropoutRate;
unsigned long m_randomSeed;
Matrix<ElemType> m_maskOfDropout;

Просмотреть файл

@ -1284,10 +1284,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (Inputs(0)->FunctionValues().GetMatrixType() == SPARSE)
LogicError("LSTMNode: input to LSTM has to be dense matrix. Consider adding a project layer using lookuptable before LSTM node. ");
if (Inputs(1)->OperationName() != LearnableParameter<ElemType>::TypeName() ||
Inputs(2)->OperationName() != LearnableParameter<ElemType>::TypeName() ||
Inputs(3)->OperationName() != LearnableParameter<ElemType>::TypeName() ||
Inputs(4)->OperationName() != LearnableParameter<ElemType>::TypeName())
// TODO: use dynamic_pointer_cast instead
if (Inputs(1)->OperationName() != OperationNameOf(LearnableParameter) ||
Inputs(2)->OperationName() != OperationNameOf(LearnableParameter) ||
Inputs(3)->OperationName() != OperationNameOf(LearnableParameter) ||
Inputs(4)->OperationName() != OperationNameOf(LearnableParameter))
throw std::logic_error("LSTM validation: need to have learnable parameters ");
if (Inputs(0)->FunctionValues().HasNoElements())

Просмотреть файл

@ -77,7 +77,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("SquareError operation requires two inputs.");
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -85,7 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -245,7 +245,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we may release the constraint that the first operant is an inputValue later so the following code should be kept
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -253,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -396,7 +396,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we may release the constraint that the first operant is an inputValue later so the following code should be kept
size_t index = 0;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -404,7 +404,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
index = 1;
if (Inputs(index)->OperationName() == LearnableParameter<ElemType>::TypeName())
if (Inputs(index)->OperationName() == OperationNameOf(LearnableParameter))
{
size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows();
size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols();
@ -783,7 +783,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_children.size() != 4)
LogicError("NoiseContrastiveEstimationNode criterion requires four inputs.");
if (Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
if (Inputs(0)->OperationName() != OperationNameOf(InputValue))
LogicError("NoiseContrastiveEstimationNode criterion requires the first input to be the label.");
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumRows())) // input and matrix can be timed
LogicError("The Matrix<ElemType> dimension for observation and weight in the NoiseContrastiveEstimationNode operation does not match.");
@ -1134,7 +1134,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_children.size() != 4)
LogicError("ClassBasedCrossEntropyWithSoftmaxNode criterion requires four inputs.");
if (Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
if (Inputs(0)->OperationName() != OperationNameOf(InputValue))
LogicError("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumRows())) // input and matrix can be timed
LogicError("The Matrix<ElemType> dimension for observation and weight in the ClassBasedCrossEntropyWithSoftmaxNode operation does not match.");

Просмотреть файл

@ -0,0 +1,13 @@
//
// <copyright file="stdafx.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// stdafx.cpp : source file that includes just the standard includes
// cn.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -0,0 +1,20 @@
//
// <copyright file="stdafx.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#ifdef _WIN32
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms
#include "targetver.h"
#endif
#include <stdio.h>
// TODO: reference additional headers your program requires here

Просмотреть файл

@ -0,0 +1,13 @@
//
// <copyright file="targetver.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -10,6 +10,7 @@
#define EVAL_EXPORTS // creating the exports here
#include "Eval.h"
#include "CNTKEval.h"
#include "CPUMatrix.h" // for SetNumThreads()
#include "SimpleOutputWriter.h"
#ifdef LEAKDETECT
#include <vld.h> // leak detection
@ -45,7 +46,6 @@ void CNTKEval<ElemType>::Init(const std::string& config)
}
size_t nThread = m_config("numCPUThreads", "1");
CPUMatrix<ElemType>::SetNumThreads(nThread);
}
// Destroy - cleanup and remove this class

Просмотреть файл

@ -22,7 +22,7 @@
</SccProvider>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTKEval</RootNamespace>
<ProjectName>CNTKEval</ProjectName>
<ProjectName>CNTKEvalDll</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@ -50,14 +50,14 @@
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>..\CNTK;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(CUDA_PATH)\lib\$(Platform);$(SolutionDir)$(Platform)\;$(Configuration);$(SolutionDir)..\Common\lib;$(SolutionDir)..\CNTK\Common\lib;$(Configuration)\;$(SolutionDir)..\..\cntk\Common\lib;$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\;$(Platform)</LibraryPath>
<IncludePath>..\CNTKSGDLib;..\CNTKComputationNetworkLib;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>..\CNTKComputationNetworkLib;..\..\Math\Math;$(CUDA_PATH)\lib\$(Platform);$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(Platform)</LibraryPath>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>..\CNTK;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(CUDA_PATH)\lib\$(Platform);$(SolutionDir)$(Platform)\;$(Configuration);$(SolutionDir)..\Common\lib;$(SolutionDir)..\CNTK\Common\lib;$(Configuration)\;$(SolutionDir)..\..\cntk\Common\lib;$(Configuration)\;$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64)</LibraryPath>
<IncludePath>..\CNTKSGDLib;..\CNTKComputationNetworkLib;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>..\CNTKComputationNetworkLib;..\..\Math\Math;$(CUDA_PATH)\lib\$(Platform);$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(Platform)</LibraryPath>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -74,12 +74,12 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib; nvml.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKComputationNetworkLib.lib; CNTKMathDll.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<DelayLoadDLLs>CNTKMathDll.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -104,13 +104,13 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKMath.lib; nvml.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>CNTKComputationNetworkLib.lib; CNTKMathDll.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
<Profile>true</Profile>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<DelayLoadDLLs>CNTKMathDll.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -128,9 +128,6 @@
<ClInclude Include="CNTKEval.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\BestGpu.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
@ -142,11 +139,6 @@
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="..\CNTK\ComputationNetwork.cpp" />
<ClCompile Include="..\CNTK\ComputationNetworkBuilder.cpp" />
<ClCompile Include="..\CNTK\ComputationNode.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>
<PrecompiledHeader>

Просмотреть файл

@ -1,9 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\Common\BestGpu.cpp" />
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="CNTKEval.cpp" />
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<Filter>Common</Filter>
@ -20,15 +17,16 @@
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\ComputationNode.cpp" />
<ClCompile Include="..\CNTK\ComputationNetwork.cpp" />
<ClCompile Include="..\CNTK\ComputationNetworkBuilder.cpp" />
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="CNTKEval.h" />
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
@ -48,6 +46,12 @@
<ClInclude Include="..\..\Common\Include\Basics.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
@ -56,5 +60,8 @@
<Filter Include="Common\Include">
<UniqueIdentifier>{f3bf0104-8a08-40c9-a4d9-af8411c49669}</UniqueIdentifier>
</Filter>
<Filter Include="Misc">
<UniqueIdentifier>{3660ead9-4e83-4246-8f76-dd1fda8e2590}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,213 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}</ProjectGuid>
<SccProjectName>
</SccProjectName>
<SccAuxPath>
</SccAuxPath>
<SccLocalPath>
</SccLocalPath>
<SccProvider>
</SccProvider>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTK</RootNamespace>
<ProjectName>CNTKSGDLib</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>..\CNTKComputationNetworkLib;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(SolutionDir)$(Platform)\$(Configuration);$(SolutionDir)..\Common\lib;$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\$(Platform)</LibraryPath>
<CustomBuildAfterTargets>Build</CustomBuildAfterTargets>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>..\CNTKComputationNetworkLib;..\..\Math\Math;..\..\Common\Include;..\..\BrainScript;C:\Program Files (x86)\Microsoft SDKs\MPI\Include;$(CUDA_PATH)\include;$(VCInstallDir)include;$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64;$(SolutionDir)$(Platform)\$(Configuration);$(SolutionDir)..\Common\lib;$(VCInstallDir)lib\amd64;$(WindowsSDK_LibraryPath_x64);$(CUDA_PATH)\lib\$(Platform)</LibraryPath>
<CustomBuildAfterTargets>Build</CustomBuildAfterTargets>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
<AdditionalIncludeDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKComputationNetworkLib.lib; CNTKMath.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
<CustomBuildStep>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>$(TargetDir)config.txt;$(TargetDir)labels.txt;$(TargetDir)network.txt;$(TargetDir)NdlScript.txt</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>Copy content files to target directory</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalIncludeDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKComputationNetworkLib.lib; CNTKMath.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>CNTKMath.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
<AdditionalLibraryDirectories>"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
<CustomBuildStep>
<Command>
</Command>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>
</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>
</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\..\Common\Include\basetypes.h" />
<ClInclude Include="..\..\Common\Include\Basics.h" />
<ClInclude Include="..\..\Common\Include\BestGpu.h" />
<ClInclude Include="..\..\Common\Include\commandArgUtil.h" />
<ClInclude Include="..\..\Common\Include\DataReader.h" />
<ClInclude Include="..\..\Common\Include\DataWriter.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\hostname.h" />
<ClInclude Include="..\..\Common\Include\minibatchsourcehelpers.h" />
<ClInclude Include="..\..\Common\Include\Platform.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="..\..\Math\Math\CUDAPageLockedMemAllocator.h" />
<ClInclude Include="..\..\Math\Math\Matrix.h" />
<ClInclude Include="..\..\Math\Math\MatrixQuantizer.h" />
<ClInclude Include="..\..\Math\Math\QuantizedMatrix.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\CompositeComputationNodes.h" />
<ClInclude Include="..\CNTK\Profiler.h" />
<ClInclude Include="AllReduceDistGradAggregator.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNetwork.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNode.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\ConvolutionalNodes.h" />
<ClInclude Include="DistGradHeader.h" />
<ClInclude Include="IComputationNetBuilder.h" />
<ClInclude Include="IDistGradAggregator.h" />
<ClInclude Include="MPIWrapper.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\DecoderNode.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\EvaluationCriterionNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\InputAndParamNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\LinearAlgebraNodes.h" />
<ClInclude Include="MultiNetworksSGD.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\NonlinearityNodes.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\RecurrentNodes.h" />
<ClInclude Include="SimpleEvaluator.h" />
<ClInclude Include="SimpleOutputWriter.h" />
<ClInclude Include="SGD.h" />
<ClInclude Include="SimpleNetworkBuilder.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="..\CNTKComputationNetworkLib\TrainingCriterionNodes.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\ConfigFile.cpp" />
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\DataWriter.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="SGD.cpp" />
<ClCompile Include="stdafx.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

Просмотреть файл

@ -0,0 +1,177 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\DataWriter.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="Profiler.cpp">
<Filter>GPU Interfacing</Filter>
</ClCompile>
<ClCompile Include="SGD.cpp">
<Filter>SGD</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\commandArgUtil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\DataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\DataWriter.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNetwork.h">
<Filter>from CNTKComputationNetworkLib\Network</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\ComputationNode.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\hostname.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\Basics.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\minibatchsourcehelpers.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\BestGpu.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\CompositeComputationNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\EvaluationCriterionNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\TrainingCriterionNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\NonlinearityNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\LinearAlgebraNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\ConvolutionalNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\RecurrentNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\InputAndParamNodes.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\CNTKComputationNetworkLib\DecoderNode.h">
<Filter>from CNTKComputationNetworkLib\Nodes</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\CrossProcessMutex.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="AllReduceDistGradAggregator.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="DistGradHeader.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="IDistGradAggregator.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="MPIWrapper.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\Platform.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\CNTK\Profiler.h">
<Filter>GPU Interfacing</Filter>
</ClInclude>
<ClInclude Include="MultiNetworksSGD.h">
<Filter>SGD</Filter>
</ClInclude>
<ClInclude Include="SGD.h">
<Filter>SGD</Filter>
</ClInclude>
<ClInclude Include="SimpleEvaluator.h">
<Filter>SGD</Filter>
</ClInclude>
<ClInclude Include="SimpleOutputWriter.h">
<Filter>Eval</Filter>
</ClInclude>
<ClInclude Include="SimpleNetworkBuilder.h">
<Filter>Eval</Filter>
</ClInclude>
<ClInclude Include="IComputationNetBuilder.h">
<Filter>SGD</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
<UniqueIdentifier>{b3d05c7b-7bcf-4b12-bcb5-dced86717202}</UniqueIdentifier>
</Filter>
<Filter Include="Common\Include">
<UniqueIdentifier>{85226dda-87ba-4da6-af04-563d0ce23b94}</UniqueIdentifier>
</Filter>
<Filter Include="Misc">
<UniqueIdentifier>{3c119a92-ffb2-4850-adae-01778324974d}</UniqueIdentifier>
</Filter>
<Filter Include="GPU Interfacing">
<UniqueIdentifier>{8d99b2cc-5209-40e4-8b4b-a7616973ae3b}</UniqueIdentifier>
</Filter>
<Filter Include="Parallelization">
<UniqueIdentifier>{8531d7fb-a673-491a-988a-012c92fafbfd}</UniqueIdentifier>
</Filter>
<Filter Include="SGD">
<UniqueIdentifier>{5e22e394-50bb-4ce7-bfda-9b8d2d1a2741}</UniqueIdentifier>
</Filter>
<Filter Include="Eval">
<UniqueIdentifier>{c263e5cd-26a3-4277-bf2f-f3de466267a3}</UniqueIdentifier>
</Filter>
<Filter Include="from CNTKComputationNetworkLib">
<UniqueIdentifier>{d5cc574b-5fd1-476b-b69e-0c6428a55262}</UniqueIdentifier>
</Filter>
<Filter Include="from CNTKComputationNetworkLib\Network">
<UniqueIdentifier>{498bb2e9-53de-4955-970e-813e3f21025b}</UniqueIdentifier>
</Filter>
<Filter Include="from CNTKComputationNetworkLib\Nodes">
<UniqueIdentifier>{0b366814-48b2-4619-bf92-85ee24e3cbc1}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -2,21 +2,20 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template<typename ElemType>
struct DistGradHeader
{
public:
size_t numSamples;
size_t numSamplesWithLabel;
ElemType criterion;
double criterion;
// variable-size array
int numEvalNode;
ElemType evalErrors[1];
double evalErrors[1];
static DistGradHeader<ElemType>* Create(int numEvalNode)
static DistGradHeader* Create(int numEvalNode)
{
DistGradHeader<ElemType>* header = (DistGradHeader<ElemType>*)new char[DistGradHeaderSize(numEvalNode)];
DistGradHeader* header = (DistGradHeader*)new char[DistGradHeaderSize(numEvalNode)];
header->numEvalNode = numEvalNode;
return header;
}
@ -27,12 +26,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//aggregate header information
void Aggregate(DistGradHeader<ElemType>* other, bool add = false)
void Aggregate(DistGradHeader* other, bool add = false)
{
if (other->numEvalNode != numEvalNode)
{
throw std::runtime_error("mismatched size");
}
RuntimeError("mismatched size");
if (!add)
{
memcpy((void*)this, (void*)other, DistGradHeaderSize(numEvalNode));
@ -57,7 +54,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
private:
static size_t DistGradHeaderSize(size_t nEvalNode)
{
return sizeof(DistGradHeader<ElemType>) + (sizeof(ElemType) * (nEvalNode - 1));
return sizeof(DistGradHeader)+(sizeof(double) * (nEvalNode - 1));
}
// Disallow construction and destruction since this type contains a variable sized array member

Просмотреть файл

@ -18,7 +18,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
}
virtual void AggregateGradients(DistGradHeader<ElemType> *headerCPU, int epochNumber) = 0;
virtual void AggregateGradients(DistGradHeader *headerCPU, int epochNumber) = 0;
size_t NumProc()
{

Просмотреть файл

@ -222,7 +222,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// for raw pointer
template<typename ElemType>
template<class ElemType>
void AllReduce(ElemType* pData, size_t nData)
{
if ((NumNodesInUse() > 1 && (Communicator() != MPI_COMM_NULL)))
@ -231,7 +231,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
template<typename ElemType>
template<class ElemType>
void Bcast(ElemType* pData, size_t nData, size_t srcRank)
{
if ((NumNodesInUse() > 1) && (Communicator() != MPI_COMM_NULL))

Просмотреть файл

@ -5,10 +5,11 @@
//
#pragma once
// TODO: this cannot be instantiated as a whole (compile error), although some function is called from CNTK.cpp--should be fixed
#include "basetypes.h"
#include "ComputationNetwork.h"
#include "IComputationNetBuilder.h"
#include "ComputationNetworkHelper.h"
#include "SimpleEvaluator.h"
#include "DataReader.h"
#include <vector>
@ -68,8 +69,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
using SGDBase::m_prevChosenMinibatchSize;
using SGDBase::GetTrainCriterionNodes;
using SGDBase::GetEvalCriterionNodes;
using SGDBase::SetDropoutRate;
using SGDBase::UpdateEvalTimeStamps;
using SGDBase::UpdateWeights;
using SGDBase::GetCheckPointFileNameForEpoch;
@ -259,28 +258,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
smoothedGradients.push_back(Matrix<ElemType>(node->FunctionValues().GetNumRows(), node->FunctionValues().GetNumCols(), node->FunctionValues().GetDeviceId()));
}
vector<ElemType> epochCriterion;
ElemType avgCriterion, prevCriterion;
vector<double> epochCriterion;
double avgCriterion, prevCriterion;
for (size_t i = 0; i < 2; i++)
epochCriterion.push_back(std::numeric_limits<ElemType>::infinity());
avgCriterion = prevCriterion = std::numeric_limits<ElemType>::infinity();
epochCriterion.push_back(std::numeric_limits<double>::infinity());
avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
std::vector<ElemType> epochEvalErrors(decoderEvaluationNodes.size(), std::numeric_limits<ElemType>::infinity());
std::vector<double> epochEvalErrors(decoderEvaluationNodes.size(), std::numeric_limits<double>::infinity());
std::vector<wstring> evalNodeNames;
for (size_t i = 0; i<decoderEvaluationNodes.size(); i++)
evalNodeNames.push_back(decoderEvaluationNodes[i]->NodeName());
size_t totalSamplesSeen = 0;
ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch];
double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
int m_numPrevLearnRates = 5; //used to control the upper learnining rate in LR search to reduce computation
vector<ElemType> prevLearnRates;
vector<double> prevLearnRates;
prevLearnRates.resize(m_numPrevLearnRates);
for (int i = 0; i<m_numPrevLearnRates; i++)
prevLearnRates[i] = std::numeric_limits<ElemType>::infinity();
prevLearnRates[i] = std::numeric_limits<double>::infinity();
//precompute mean and invStdDev nodes and save initial model
if (/// to-do doesn't support pre-compute such as MVN here
@ -299,7 +298,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::invalid_argument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch.");
ULONG dropOutSeed = 1;
ElemType prevDropoutRate = 0;
double prevDropoutRate = 0;
bool learnRateReduced = false;
@ -308,8 +307,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto t_start_epoch = clock();
//set dropout rate
SetDropoutRate(*encoderNet, encoderEvaluationNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
SetDropoutRate(*decoderNet, decoderCriterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
ComputationNetwork::SetDropoutRate<ElemType>(*encoderNet, encoderEvaluationNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
ComputationNetwork::SetDropoutRate<ElemType>(*decoderNet, decoderCriterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
//learning rate adjustment
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
@ -339,7 +338,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto t_end_epoch = clock();
ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
double epochTime = 1.0*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
// fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion[0]);
@ -369,7 +368,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cvDecoderSetTrainAndEvalNodes.push_back(decoderCriterionNodes[0]->NodeName());
cvDecoderSetTrainAndEvalNodes.push_back(decoderEvaluationNodes[0]->NodeName());
vector<ElemType> vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
vector<double> vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
encoderNet, decoderNet,
encoderValidationSetDataReader,
decoderValidationSetDataReader, cvEncoderSetTrainAndEvalNodes,
@ -382,14 +381,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool loadedPrevModel = false;
size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
if (avgCriterion == std::numeric_limits<ElemType>::infinity())
if (avgCriterion == std::numeric_limits<double>::infinity())
avgCriterion = epochCriterion[0];
else
avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion)* avgCriterion + epochCriterion[0]) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
{
if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<ElemType>::infinity())
if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity())
{
if (m_loadBestModel)
{
@ -414,7 +413,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_continueReduce)
{
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
{
if (learnRateReduced == false)
{
@ -436,13 +435,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
{
learnRatePerSample *= m_learnRateDecreaseFactor;
fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
}
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan*prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan*prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
{
learnRatePerSample *= m_learnRateIncreaseFactor;
fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
@ -563,9 +562,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
smoothedGradients.push_back(Matrix<ElemType>(node->FunctionValues().GetNumRows(), node->FunctionValues().GetNumCols(), node->FunctionValues().GetDeviceId()));
}
ElemType epochCriterion, avgCriterion, prevCriterion;
epochCriterion = std::numeric_limits<ElemType>::infinity();
avgCriterion = prevCriterion = std::numeric_limits<ElemType>::infinity();
double epochCriterion, avgCriterion, prevCriterion;
epochCriterion = std::numeric_limits<double>::infinity();
avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
@ -574,7 +573,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
iNumEvaluations += evaluationNodes[i]->size();
}
std::vector<ElemType> epochEvalErrors(iNumEvaluations, std::numeric_limits<ElemType>::infinity());
std::vector<double> epochEvalErrors(iNumEvaluations, std::numeric_limits<double>::infinity());
std::vector<wstring> evalNodeNames;
for (size_t k = 0; k < iNumNetworks; k++)
@ -584,13 +583,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
size_t totalSamplesSeen = 0;
ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch];
double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
int m_numPrevLearnRates = 5; //used to control the upper learnining rate in LR search to reduce computation
vector<ElemType> prevLearnRates;
vector<double> prevLearnRates;
prevLearnRates.resize(m_numPrevLearnRates);
for (int i = 0; i<m_numPrevLearnRates; i++)
prevLearnRates[i] = std::numeric_limits<ElemType>::infinity();
prevLearnRates[i] = std::numeric_limits<double>::infinity();
//precompute mean and invStdDev nodes and save initial model
if (/// to-do doesn't support pre-compute such as MVN here
@ -620,7 +619,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::invalid_argument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch.");
ULONG dropOutSeed = 1;
ElemType prevDropoutRate = 0;
double prevDropoutRate = 0;
bool learnRateReduced = false;
@ -632,9 +631,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t k = 0; k < iNumNetworks; k++)
{
if (evaluationNodes[k]->size() > 0)
SetDropoutRate(*nets[k], (*evaluationNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
ComputationNetwork::SetDropoutRate<ElemType>(*nets[k], (*evaluationNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
if (criterionNodes[k]->size() > 0)
SetDropoutRate(*nets[k], (*criterionNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
ComputationNetwork::SetDropoutRate<ElemType>(*nets[k], (*criterionNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
}
//learning rate adjustment
@ -670,7 +669,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto t_end_epoch = clock();
ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
double epochTime = 1.0*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
/**
this is hacky. Only allow evaluatio on the first encoder->decoder pair
@ -700,7 +699,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
SimpleEvaluator<ElemType> evalforvalidation(*decoderNet);
ElemType vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
double vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
nets,
validationDataReader,
m_mbSize[i]);
@ -712,14 +711,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool loadedPrevModel = false;
size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
if (avgCriterion == std::numeric_limits<ElemType>::infinity())
if (avgCriterion == std::numeric_limits<double>::infinity())
avgCriterion = epochCriterion;
else
avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion)* avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
{
if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<ElemType>::infinity())
if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity())
{
if (m_loadBestModel)
{
@ -739,7 +738,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_continueReduce)
{
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
{
if (learnRateReduced == false)
{
@ -764,13 +763,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
{
learnRatePerSample *= m_learnRateDecreaseFactor;
fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
}
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan*prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan*prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
{
learnRatePerSample *= m_learnRateIncreaseFactor;
fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
@ -817,20 +816,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<std::vector<ComputationNodeBasePtr>*> labelNodes,
vector<std::vector<ComputationNodeBasePtr>*> criterionNodes,
const std::list<ComputationNodeBasePtr>& learnableNodes,
const ElemType learnRatePerSample,
const double learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
ElemType& epochCriterion, std::vector<ElemType>& epochEvalErrors, size_t& totalSamplesSeen)
double& epochCriterion, std::vector<double>& epochEvalErrors, size_t& totalSamplesSeen)
{
ComputationNetwork* encoderNet = nets[0];
ComputationNetwork* decoderNet = nets[1];
DEVICEID_TYPE device = encoderNet->GetDeviceID();
Matrix<ElemType> historyMat(device);
ElemType readTimeInMBs = 0, ComputeTimeInMBs = 0;
ElemType epochCriterionLastMBs = 0;
double readTimeInMBs = 0, ComputeTimeInMBs = 0;
double epochCriterionLastMBs = 0;
int numSamplesLastMBs = 0;
std::vector<ElemType> epochEvalErrorsLastMBs(epochEvalErrors.size(), 0);
std::vector<double> epochEvalErrorsLastMBs(epochEvalErrors.size(), 0);
clock_t startReadMBTime = 0, startComputeMBTime = 0;
clock_t endReadMBTime = 0, endComputeMBTime = 0;
@ -888,9 +887,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = 0; i < iNumNetworks; i++)
{
UpdateEvalTimeStamps(*featureNodes[i]);
ComputationNetwork::UpdateEvalTimeStamps(*featureNodes[i]);
if (labelNodes[i]->size() > 0)
UpdateEvalTimeStamps(*labelNodes[i]);
ComputationNetwork::UpdateEvalTimeStamps(*labelNodes[i]);
}
endReadMBTime = clock();
@ -942,8 +941,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
numMBsRun++;
if (m_traceLevel > 0)
{
ElemType MBReadTime = (ElemType)(endReadMBTime - startReadMBTime) / (CLOCKS_PER_SEC);
ElemType MBComputeTime = (ElemType)(endComputeMBTime - startComputeMBTime) / CLOCKS_PER_SEC;
double MBReadTime = (double)(endReadMBTime - startReadMBTime) / (CLOCKS_PER_SEC);
double MBComputeTime = (double)(endComputeMBTime - startComputeMBTime) / CLOCKS_PER_SEC;
readTimeInMBs += MBReadTime;
ComputeTimeInMBs += MBComputeTime;
@ -954,10 +953,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
epochCriterion = localEpochCriterion.Get00Element();
for (size_t i = 0; i< numEvalNodes; i++)
epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0, i);
epochEvalErrors[i] = (const double)localEpochEvalErrors(0, i);
ElemType llk = (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs;
ElemType ppl = exp(llk);
double llk = (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs;
double ppl = exp(llk);
fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Decoder Train Loss Per Sample = %.8g PPL = %.4e ", epochNumber + 1, numMBsRun - m_numMBsToShowResult + 1, numMBsRun, numSamplesLastMBs,
llk, ppl);
for (size_t i = 0; i<numEvalNodes; i++){
@ -999,7 +998,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
epochCriterion = localEpochCriterion.Get00Element();
for (size_t i = 0; i < numEvalNodes; i++)
{
epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0, i);
epochEvalErrors[i] = localEpochEvalErrors(0, i);
}
fprintf(stderr, "total samples in epoch[%d] = %zd\n", epochNumber, totalEpochSamples);
}
@ -1043,13 +1042,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
deviceId = node->FunctionValues().GetDeviceId(); // original device id
node->FunctionValues().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
ElemType eOrg = node->FunctionValues()(irow, icol); /// warning :: this function will put matrix into CPU
double eOrg = node->FunctionValues()(irow, icol); /// warning :: this function will put matrix into CPU
node->FunctionValues().TransferToDeviceIfNotThere(deviceId, true);
/// perturb parameter
ElemType ePos = eOrg + (ElemType)EPSILON;
double ePos = eOrg + EPSILON;
node->FunctionValues().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
node->FunctionValues().SetValue(irow, icol, ePos);
node->FunctionValues().SetValue(irow, icol, (ElemType)ePos);
node->FunctionValues().TransferToDeviceIfNotThere(deviceId, true);
node->UpdateEvalTimeStamp();
@ -1061,11 +1060,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
featureNodes, criterionNodes,
localEpochCriterion, localEpochEvalErrors);
ElemType score1 = localEpochCriterion.Get00Element();
double score1 = localEpochCriterion.Get00Element();
ElemType eNeg = eOrg - (ElemType)EPSILON;
double eNeg = eOrg - EPSILON;
node->FunctionValues().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
node->FunctionValues().SetValue(irow, icol, eNeg);
node->FunctionValues().SetValue(irow, icol, (ElemType)eNeg);
node->FunctionValues().TransferToDeviceIfNotThere(deviceId, true);
node->UpdateEvalTimeStamp();
localEpochCriterion.SetValue(0);
@ -1076,12 +1075,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
featureNodes, criterionNodes,
localEpochCriterion, localEpochEvalErrors);
ElemType score1r = localEpochCriterion.Get00Element();
double score1r = localEpochCriterion.Get00Element();
ElemType grdNum = (score1r - score1) / (eNeg - ePos);
double grdNum = (score1r - score1) / (eNeg - ePos);
node->FunctionValues().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
node->FunctionValues().SetValue(irow, icol, eOrg);
node->FunctionValues().SetValue(irow, icol, (ElemType)eOrg);
node->FunctionValues().TransferToDeviceIfNotThere(deviceId, true);
node->UpdateEvalTimeStamp();
localEpochCriterion.SetValue(0);
@ -1095,12 +1094,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
EncoderDecoderWithHiddenStatesErrorProp(nets, pairNodes, criterionNodes);
node->GradientValues().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
ElemType grdErr = node->GradientValues()(irow, icol);
double grdErr = node->GradientValues()(irow, icol);
node->GradientValues().TransferToDeviceIfNotThere(deviceId, true);
// check if they are consistent
ElemType threshold = (ElemType)pow((ElemType)10.0, max((ElemType)0.0, ceil(log10(min(fabs(grdErr), fabs(grdNum))))) - (int)m_gradientCheckSigDigit);
ElemType diff = (ElemType)fabs(grdErr - grdNum);
double threshold = pow(10.0, max(0.0, ceil(log10(min(fabs(grdErr), fabs(grdNum))))) - (int)m_gradientCheckSigDigit);
double diff = fabs(grdErr - grdNum);
bool wrong = (std::isnan(diff) || diff > threshold);
if (wrong)
{
@ -1185,7 +1184,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(decoderCriterionNodes[0])->FunctionValues(), 0, 0, localEpochCriterion, 0, 0);
size_t numEvalNodes = decoderEvaluationNodes.size();
std::vector<ElemType>mbEvalErrors(numEvalNodes, 0);
std::vector<double>mbEvalErrors(numEvalNodes, 0);
for (size_t i = 0; i < numEvalNodes; i++)
{

Просмотреть файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,443 @@
//
// <copyright file="SGD.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "Basics.h"
#include "ComputationNetwork.h"
#include "NonlinearityNodes.h" // for DropoutNode
#include "CompositeComputationNodes.h" // for PrecomputeNode
#include "SimpleEvaluator.h"
#include "DataReader.h"
#include "IComputationNetBuilder.h"
#include <vector>
#include <string>
#include <stdexcept>
#include "fileutil.h"
#include "commandArgUtil.h"
#include <chrono>
#include <random>
#include "TimerUtility.h"
#include "Profiler.h"
using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
// How the per-epoch learning rate is determined when it is not fully
// specified by the learningRatesPerMB/learningRatesPerSample schedules.
enum class LearningRateSearchAlgorithm : int
{
None,             // use the configured learning-rate schedule as-is
AdjustAfterEpoch, // scale the rate up/down after each epoch based on the criterion (see m_learnRate{In,De}creaseFactor)
SearchBeforeEpoch // search for a good rate on a subset of data before the epoch starts (see SearchForBestLearnRate)
};
// Regularization applied when adapting an existing model (see SGD::Adapt
// and m_adaptationRegType/m_adaptationRegWeight).
enum class AdaptationRegType : int
{
None, // no adaptation regularization
KL    // presumably KL-divergence regularization against the reference model (refNode) -- confirm in SGD.cpp
};
// Per-parameter gradient scaling scheme used by the weight update.
enum class GradientsUpdateType : int
{
None,    // plain SGD, no adaptive scaling
AdaGrad, // AdaGrad adaptive scaling (the default, see GradientUpdateInfo)
RmsProp  // RMSProp scaling, configured via RMSPropInfo
};
// TODO: While currently combining these methods is not supported,
// these are not mutually exclusive and we can/should support combinations of these
// in the future
// Bit-flag values so combinations could be expressed later (see TODO above).
enum class ParallelizationMethod : int
{
None = 0,
DataParallelSGD = 1,          // gradients aggregated across workers each minibatch (see m_distGradAgg, m_numGradientBits)
ModelAveragingSGD = (1 << 1), // full models averaged periodically (see ModelAveragingSync, m_nFramesBetweenMASync)
ModelParallelSGD = (1 << 2), // Currently unsupported
};
// Configuration parameters for the RMSProp gradient-update rule.
// NOTE(review): inc/dec/max/min look like Rprop-style step-adjustment
// factors and bounds -- confirm against the RmsProp kernel implementation.
struct RMSPropInfo
{
    double gamma; // decay factor for the running average of squared gradients
    double inc;   // increase factor
    double dec;   // decrease factor
    double max;   // upper bound
    double min;   // lower bound

    // Defaults match the values CNTK has always used.
    RMSPropInfo() :
        gamma(0.99),
        inc(1.2),
        dec(0.75),
        max(10.0),
        min(0.1)
    { }
};
struct GradientUpdateInfo
{
GradientsUpdateType mType;
float mGaussianNoiseInjectStd;
GradientUpdateInfo()
{
mType = GradientsUpdateType::AdaGrad;
mGaussianNoiseInjectStd = 0.0075f;
}
};
template<class ElemType> class IDistGradAggregator;
// TODO: make this independent of ElemType. Then these repeated dynamic_pointer_casts will go away
// TODO: why is this a class, and not just a procedure? Then we wouldn't have to include the massive header
// -----------------------------------------------------------------------
// SGD -- stochastic-gradient-descent trainer for a ComputationNetwork.
// Holds the complete training configuration (learning-rate and momentum
// schedules, regularization weights, parallel-training settings, ...)
// and drives training: Train()/Adapt() -> TrainOrAdaptModel() ->
// TrainOneEpoch(). Implementation lives in SGD.cpp.
// -----------------------------------------------------------------------
template<class ElemType>
class SGD
{
protected:
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
public:
// construct the trainer from a config section; the full parameter set is
// the one documented on Init() below
SGD(const ConfigParameters& configSGD);
//autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample
// Init -- stores all training hyper-parameters; the parameters correspond
// 1:1 to the m_* members declared at the bottom of this class.
void Init(const floatargvector& learningRatesPerMB,
const floatargvector& learningRatesPerSample,
const intargvector& mbSize,
const size_t epochSize,
const size_t maxEpochs,
const wstring& modelPath,
const floatargvector& momentumPerMB,
const floatargvector& momentumPerSample,
const bool gradientClippingWithTruncation,
const double clippingThresholdPerSample,
const LearningRateSearchAlgorithm autoLearnRateSearchType,
const double increaseLearnRateIfImproveMoreThan,
const double learnRateIncreaseFactor,
const double reduceLearnRateIfImproveLessThan,
const bool continueReduce,
const double learnRateDecreaseFactor,
floatargvector dropoutRates,
const bool loadBestModel,
const intargvector& numMiniBatch4LRSearch,
const size_t numPrevLearnRates,
const size_t numBestSearchEpoch,
const int traceLevel,
const size_t numMBsToShowResult,
const size_t numMBsToCUDAProfile,
const size_t maxTempMemSizeInSamplesForCNN,
const GradientUpdateInfo gradUpdateType,
const bool keepCheckPointFiles,
const AdaptationRegType adaptationRegType,
const double adaptationRegWeight,
const wstring trainCriterionNodeName,
const wstring evalCriterionNodeName,
const bool doGradientCheck,
const double gradientCheckSigDigit,
const bool validateAfterModelReloading,
RMSPropInfo rpi,
size_t learnRateAdjustInterval,
const bool UsingAllDataForPreComputed,
const bool needAveMultiplier,
const double L2RegWeight,
const double L1RegWeight,
const bool autoAdjustMinibatch,
const size_t minibatchSizeTuningFrequency,
const size_t minibatchSizeTuningMax,
const bool useCVSetControlLRIfCVExists,
const bool useEvalCriterionControlLR,
const size_t minibatchSearchCriterionErrorMargin);
// Adapt -- adapts an existing model (loaded from origModelFileName) to new
// data; refNodeName names the reference node used for adaptation
// regularization (see AdaptationRegType) -- confirm details in SGD.cpp
void Adapt(wstring origModelFileName, wstring refNodeName,
IDataReader<ElemType>* trainSetDataReader,
IDataReader<ElemType>* validationSetDataReader,
const DEVICEID_TYPE deviceID, const bool makeMode = true);
// SequenceTrain -- training entry point that starts from an existing model
// (origModelFileName); exact semantics are in SGD.cpp, not visible here
void SequenceTrain(IComputationNetBuilder<ElemType>* netBuilder, wstring origModelFileName,
IDataReader<ElemType>* trainSetDataReader, IDataReader<ElemType>* validationSetDataReader,
const DEVICEID_TYPE deviceID, const bool makeMode = true);
// Train -- main training entry point; netBuilder creates or loads the
// network; makeMode presumably enables resuming from an existing model
// (see DetermineStartEpoch) -- TODO confirm
void Train(IComputationNetBuilder<ElemType>* netBuilder,
IDataReader<ElemType>* trainSetDataReader,
IDataReader<ElemType>* validationSetDataReader,
const bool makeMode = true);
protected:
// look up the training/evaluation criterion nodes in 'net' (presumably
// honoring m_trainCriterionNodeName / m_evalCriterionNodeName when set)
std::vector<ComputationNodeBasePtr> & GetTrainCriterionNodes(ComputationNetwork& net);
std::vector<ComputationNodeBasePtr> & GetEvalCriterionNodes(ComputationNetwork& net);
// the outer epoch loop shared by Train() and Adapt(); refNet/refNode are
// the adaptation reference (presumably unused for plain training)
void TrainOrAdaptModel(int startEpoch, ComputationNetwork& net,
ComputationNetwork& refNet,
ComputationNodeBasePtr refNode,
IDataReader<ElemType>* trainSetDataReader,
IDataReader<ElemType>* validationSetDataReader);
protected:
// return true if precomputation is executed.
bool PreCompute(ComputationNetwork& net,
IDataReader<ElemType>* trainSetDataReader,
std::vector<ComputationNodeBasePtr> & featureNodes,
std::vector<ComputationNodeBasePtr> & labelNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices);
// return a reasonable initial learning rate based on the initial mbsize
double SearchForBestLearnRate(ComputationNetwork& net,
ComputationNetwork& refNet,
const ComputationNodeBasePtr refNode, const int epochNumber,
const double curLearnRate,
IDataReader<ElemType>* trainSetDataReader,
const std::vector<ComputationNodeBasePtr> & featureNodes,
const std::vector<ComputationNodeBasePtr> & labelNodes,
const std::vector<ComputationNodeBasePtr> & criterionNodes,
const std::vector<ComputationNodeBasePtr> & evaluationNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
const std::list<ComputationNodeBasePtr> & learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample);
// trains on a small amount of data and then reloads the previous model;
// helper for the learn-rate / minibatch-size searches (name-based -- confirm)
void TrainOneMiniEpochAndReloadModel(ComputationNetwork& net,
ComputationNetwork& refNet,
const ComputationNodeBasePtr refNode, const int epochNumber,
const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
const double learnRatePerSample,
const size_t minibatchSize,
const std::vector<ComputationNodeBasePtr> & featureNodes,
const std::vector<ComputationNodeBasePtr> & labelNodes,
const std::vector<ComputationNodeBasePtr> & criterionNodes,
const std::vector<ComputationNodeBasePtr> & evaluationNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
const std::list<ComputationNodeBasePtr> & learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
std::string prefixMsg = "");
// adaptive minibatch sizing: periodically re-tunes the minibatch size
// (see m_minibatchSizeTuningFrequency / m_minibatchSizeTuningMax)
size_t AdaptiveMinibatchSizing(ComputationNetwork& net,
ComputationNetwork& refNet,
const ComputationNodeBasePtr refNode,
const int epochNumber,
const size_t numFramesToUseInSearch,
IDataReader<ElemType>* trainSetDataReader,
const double learnRatePerSample,
const size_t initialMinibatchSize,
const std::vector<ComputationNodeBasePtr> & featureNodes,
const std::vector<ComputationNodeBasePtr> & labelNodes,
const std::vector<ComputationNodeBasePtr> & criterionNodes,
const std::vector<ComputationNodeBasePtr> & evaluationNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
const std::list<ComputationNodeBasePtr> & learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
const double learningRateAdjustmentFactor);
// uses a small percentage of training data of minibatch to
// speculatively train with various MB sizes; then picks the best
size_t SearchForBestMinibatchSize(ComputationNetwork& net,
ComputationNetwork& refNet,
const ComputationNodeBasePtr refNode,
const int epochNumber,
const size_t numFramesToUseInSearch,
IDataReader<ElemType>* trainSetDataReader,
const double learnRatePerSample,
const std::vector<ComputationNodeBasePtr> & featureNodes,
const std::vector<ComputationNodeBasePtr> & labelNodes,
const std::vector<ComputationNodeBasePtr> & criterionNodes,
const std::vector<ComputationNodeBasePtr> & evaluationNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
const std::list<ComputationNodeBasePtr> & learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
const size_t minMinibatchSize, const size_t maxMinibatchSize);
// Tries to compute derivatives for the whole utterances, which will be
// fed to the neural network as features.
void AttemptUtteranceDerivativeFeatures(ComputationNetwork& net,
IDataReader<ElemType>* trainSetDataReader,
const std::vector<ComputationNodeBasePtr> & featureNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices);
// TrainOneEpoch -- runs one training epoch; reports the epoch criterion
// and per-evaluation-node errors, and advances totalSamplesSeen
size_t TrainOneEpoch(ComputationNetwork& net,
ComputationNetwork& refNet,
const ComputationNodeBasePtr refNode,
const int epochNumber,
const size_t epochSize,
IDataReader<ElemType>* trainSetDataReader,
const double learnRatePerSample,
size_t tunedMBSize,
const std::vector<ComputationNodeBasePtr> & featureNodes,
const std::vector<ComputationNodeBasePtr> & labelNodes,
const std::vector<ComputationNodeBasePtr> & criterionNodes,
const std::vector<ComputationNodeBasePtr> & evaluationNodes,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
const std::list<ComputationNodeBasePtr> & learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
std::string prefixMsg = "");
// lazily sets up m_distGradAgg/m_gradHeader for data-parallel gradient aggregation
void LazyInitDistGradAgg(const std::list<ComputationNodeBasePtr>& learnableNodes, int numEvalNodes, int traceLevel);
// model-averaging (MA) parallel training: checks whether a sync is due and
// reports timing stats (see m_iMASyncStatsTrace)
bool ModelAveragingProcessing(size_t nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes, size_t& nProcessedFrames,
float& SecondsSinceLastSyncFinished, float& SecondsSpentOnSync);
// performs one model-averaging sync across workers; returns a sample count (confirm in SGD.cpp)
size_t ModelAveragingSync(int nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes);
public:
// UpdateWeightsS - static version of UpdateWeights()
static void UpdateWeightsS(const SGD* sgd, Matrix<ElemType>& functionValues,
Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradient,
const double learnRatePerSample,
const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight,
const double L1RegWeight,
const bool needAveMultiplier);
protected:
// UpdateWeights - update the weights of 'node'; 'smoothedGradient' is the
// persistent per-parameter state of the update rule (momentum/AdaGrad/
// RMSProp accumulators -- confirm in SGD.cpp)
void UpdateWeights(const ComputationNodeBasePtr node,
Matrix<ElemType>& smoothedGradient,
const double learnRatePerSample,
const double momentumPerSample,
const size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier) const;
// clip 'gradient' according to m_clippingThresholdPerSample and
// m_gradientClippingWithTruncation
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
// persist training state next to the model so training can be resumed
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const double prevCriterion,
const size_t minibatchSize);
// restore state written by SaveCheckPointInfo(); presumably returns false
// when no checkpoint exists -- confirm
bool LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
// file-name helpers for the per-epoch checkpoint and model files
wstring GetCheckPointFileNameForEpoch(const int epoch);
wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false);
// scan for existing model files to find the epoch to resume from;
// return -1 if nothing exists
int DetermineStartEpoch(const bool makeMode);
GradientsUpdateType GradUpdateType() const { return m_gradType.mType; }
double GradientUpdateNoiseStd() const { return m_gradType.mGaussianNoiseInjectStd; }
public:
// perturbation used by GradientCheck() for numeric differentiation
// (NOTE: plain macro -- leaks into every file that includes this header)
#define EPSILON 1e-5
// numeric gradient check: perturbs parameters by +/-EPSILON and compares
// the numeric gradient against the backprop gradient (see SGD.cpp)
bool GradientCheck(ComputationNetwork& net,
const std::vector<ComputationNodeBasePtr> & criterionNodes,
const std::list<ComputationNodeBasePtr> & learnableNodes,
int npos);
protected:
// learning rate per sample provided outside
floatargvector m_learningRatesPerSample;
// only true when the user specifies LearningRatePerMB and the number of parallel utterances in Reader > 1
bool m_needToNormalizeLRByParallUtterance;
bool m_needToNormalizeMomentumByParallUtterance;
intargvector m_mbSize;
// the number of samples in each epoch (0 means, use all the samples in each epoch).
size_t m_epochSize;
// the total number of epochs to run.
size_t m_maxEpochs;
floatargvector m_momentumPerSample;
bool m_gradientClippingWithTruncation;
double m_clippingThresholdPerSample;
wstring m_modelPath;
wstring m_trainCriterionNodeName;
wstring m_evalCriterionNodeName;
intargvector m_numMiniBatch4LRSearch;
size_t m_numBestSearchEpoch;
LearningRateSearchAlgorithm m_autoLearnRateSearchType;
AdaptationRegType m_adaptationRegType;
double m_adaptationRegWeight;
bool m_needAdaptRegularization;
bool m_loadBestModel;
double m_reduceLearnRateIfImproveLessThan;
bool m_continueReduce;
// determine after how many epochs the learning rate should be auto adjusted.
size_t m_learnRateAdjustInterval;
bool m_useCVSetControlLRIfCVExists;
bool m_useEvalCriterionControlLR;
double m_increaseLearnRateIfImproveMoreThan;
double m_learnRateIncreaseFactor;
double m_learnRateDecreaseFactor;
// adaptive minibatch sizing (see AdaptiveMinibatchSizing)
size_t m_prevChosenMinibatchSize;
bool m_autoAdjustMinibatch;
size_t m_minibatchSearchCriterionErrorMargin;
size_t m_minibatchSizeTuningFrequency;
size_t m_minibatchSizeTuningMax;
floatargvector m_dropoutRates;
size_t m_maxTempMemSizeInSamplesForCNN;
int m_traceLevel;
size_t m_numPrevLearnRates;
double m_minLearnRate;
GradientUpdateInfo m_gradType;
RMSPropInfo m_rpi;
bool m_keepCheckPointFiles;
int m_numMBsToShowResult;
int m_numMBsToCUDAProfile;
bool m_doGradientCheck;
double m_gradientCheckSigDigit;
bool m_doUnitTest;
bool m_validateAfterModelReloading;
bool m_useAllDataForPreComputedNode;
// Parallel training
ParallelizationMethod m_parallelizationMethod;
// data-parallel SGD state (raw pointers; ownership/lifetime not visible in this header)
IDistGradAggregator<ElemType>* m_distGradAgg;
struct DistGradHeader* m_gradHeader;
int m_numGradientBits;
bool m_zeroThresholdFor1Bit;
bool m_enableDistributedMBReading;
int m_parallelizationStartEpochNum;
// Parallel training related with MA
// decides how much detail of model-averaging performance stats to show
// (seconds spent on sync, seconds since last sync, etc.):
// 0:   show no performance stats
// 1:   show stats on every sync
// n>1: show stats after every n syncs
int m_iMASyncStatsTrace;
size_t m_nFramesBetweenMASync;
bool m_needAveMultiplier;
double m_L2RegWeight;
double m_L1RegWeight;
};
}}}

Просмотреть файл

@ -12,11 +12,11 @@
#include <fstream>
#include <queue>
#include "Basics.h"
#include "Helpers.h" // for foreach_column() macro
#include "fileutil.h"
#include "DataReader.h"
#include "DataWriter.h"
#include "ComputationNetwork.h"
#include "ComputationNetworkHelper.h"
#include "TrainingCriterionNodes.h" // TODO: we should move the functions that depend on these to the .cpp
#include "CompositeComputationNodes.h"
@ -25,29 +25,30 @@ using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
struct NN_state {
struct NN_state
{
map<wstring, Matrix<ElemType>> hidden_activity;
};
template<class ElemType>
struct Token{
Token(const ElemType score, const std::vector<size_t> &sequence, const NN_state<ElemType> & state)
: score(score), sequence(sequence), state(state) {
}
bool operator<(const Token &t) const {
struct Token
{
Token(const double score, const std::vector<size_t> &sequence, const NN_state<ElemType> & state) :
score(score), sequence(sequence), state(state)
{ }
bool operator<(const Token<ElemType> &t) const
{
return score < t.score;
}
ElemType score;
double score;
vector<size_t> sequence;
NN_state<ElemType> state;
};
// TODO: get rid of dependency on ElemType
template<class ElemType>
class SimpleEvaluator : ComputationNetworkHelper<ElemType>
class SimpleEvaluator
{
typedef ComputationNetworkHelper<ElemType> B;
using B::UpdateEvalTimeStamps;
protected:
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
@ -64,7 +65,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
vector<ElemType> Evaluate(IDataReader<ElemType>* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
vector<double> Evaluate(IDataReader<ElemType>* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
{
//specify evaluation nodes
std::vector<ComputationNodeBasePtr> evalNodes;
@ -94,11 +95,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//initialize eval results
std::vector<ElemType> evalResults;
std::vector<double> evalResults;
for (int i = 0; i < evalNodes.size(); i++)
{
evalResults.push_back((ElemType)0);
}
evalResults.push_back((double)0);
//prepare features and labels
auto & featureNodes = m_net.FeatureNodes();
@ -117,7 +116,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; //MBs run before this display
std::vector<ElemType> evalResultsLastMBs;
std::vector<double> evalResultsLastMBs;
for (int i = 0; i < evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType)0);
@ -125,8 +124,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (dataReader->GetMinibatch(inputMatrices))
{
UpdateEvalTimeStamps(featureNodes);
UpdateEvalTimeStamps(labelNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
actualMBSize = m_net.GetActualMBSize();
m_net.SetActualMiniBatchSize(actualMBSize);
@ -140,7 +139,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int i = 0; i<evalNodes.size(); i++)
{
m_net.Evaluate(evalNodes[i]);
evalResults[i] += (ElemType)evalNodes[i]->Get00Element(); //criterionNode should be a scalar
evalResults[i] += (double)evalNodes[i]->Get00Element(); //criterionNode should be a scalar
}
totalEpochSamples += numSamplesWithLabel;
@ -192,7 +191,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//returns error rate
ElemType EvaluateUnroll(IDataReader<ElemType>* dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
double EvaluateUnroll(IDataReader<ElemType>* dataReader, const size_t mbSize, double &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
{
std::vector<ComputationNodeBasePtr> & featureNodes = m_net.FeatureNodes();
std::vector<ComputationNodeBasePtr> & labelNodes = m_net.LabelNodes();
@ -213,16 +212,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
dataReader->StartMinibatchLoop(mbSize, 0, testSize);
ElemType epochEvalError = 0;
ElemType epochCrossEntropy = 0;
double epochEvalError = 0;
double epochCrossEntropy = 0;
size_t totalEpochSamples = 0;
ElemType prevEpochEvalError = 0;
ElemType prevEpochCrossEntropy = 0;
double prevEpochEvalError = 0;
double prevEpochCrossEntropy = 0;
size_t prevTotalEpochSamples = 0;
size_t prevStart = 1;
size_t numSamples = 0;
ElemType crossEntropy = 0;
ElemType evalError = 0;
double crossEntropy = 0;
double evalError = 0;
ofstream outputStream;
if (output)
@ -250,10 +249,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net.Evaluate(evaluationNodes[npos]);
ElemType mbCrossEntropy = (ElemType)criterionNodes[npos]->Get00Element(); // criterionNode should be a scalar
double mbCrossEntropy = (double)criterionNodes[npos]->Get00Element(); // criterionNode should be a scalar
epochCrossEntropy += mbCrossEntropy;
ElemType mbEvalError = (ElemType)evaluationNodes[npos]->Get00Element(); //criterionNode should be a scalar
double mbEvalError = (double)evaluationNodes[npos]->Get00Element(); //criterionNode should be a scalar
epochEvalError += mbEvalError;
}
@ -301,8 +300,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//final statistics
epochEvalError /= (ElemType)totalEpochSamples;
epochCrossEntropy /= (ElemType)totalEpochSamples;
epochEvalError /= (double)totalEpochSamples;
epochCrossEntropy /= (double)totalEpochSamples;
fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy);
if (outputStream.is_open())
{
@ -315,11 +314,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
protected:
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs,
const vector<ComputationNodeBasePtr>& evalNodes,
const ElemType evalResults, const ElemType evalResultsLastMBs, bool displayConvertedValue = false)
const double evalResults, const double evalResultsLastMBs, bool displayConvertedValue = false)
{
vector<ElemType> evaR;
vector<double> evaR;
evaR.push_back(evalResults);
vector<ElemType> evaLast;
vector<double> evaLast;
evaLast.push_back(evalResultsLastMBs);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastMBs, evalNodes, evaR, evaLast, displayConvertedValue);
@ -327,22 +326,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector<ComputationNodeBasePtr>& evalNodes,
const vector<ElemType> & evalResults, const vector<ElemType> & evalResultsLastMBs, bool displayConvertedValue = false)
const vector<double> & evalResults, const vector<double> & evalResultsLastMBs, bool displayConvertedValue = false)
{
fprintf(stderr, "Minibatch[%lu-%lu]: Samples Seen = %lu ", startMBNum, endMBNum, numSamplesLastMBs);
for (size_t i = 0; i < evalResults.size(); i++)
{
ElemType eresult = (evalResults[i] - evalResultsLastMBs[i]) / numSamplesLastMBs;
double eresult = (evalResults[i] - evalResultsLastMBs[i]) / numSamplesLastMBs;
fprintf(stderr, "%ls: %ls/Sample = %.8g ", evalNodes[i]->NodeName().c_str(), evalNodes[i]->OperationName().c_str(), eresult);
if (displayConvertedValue)
{
//display Perplexity as well for crossEntropy values
if (evalNodes[i]->OperationName() == CrossEntropyWithSoftmaxNode<ElemType>::TypeName() ||
evalNodes[i]->OperationName() == CrossEntropyNode<ElemType>::TypeName() ||
evalNodes[i]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode<ElemType>::TypeName() ||
evalNodes[i]->OperationName() == NoiseContrastiveEstimationNode<ElemType>::TypeName())
if (evalNodes[i]->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode) ||
evalNodes[i]->OperationName() == OperationNameOf(CrossEntropyNode) ||
evalNodes[i]->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
evalNodes[i]->OperationName() == OperationNameOf(NoiseContrastiveEstimationNode))
fprintf(stderr, "Perplexity = %.8g ", std::exp(eresult));
}
}
@ -372,7 +371,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
this evaluates encoder network and decoder framework
only beam search decoding is applied to the last network
*/
ElemType EvaluateEncoderDecoderWithHiddenStates(
double EvaluateEncoderDecoderWithHiddenStates(
vector<ComputationNetwork*> nets,
vector<IDataReader<ElemType>*> dataReaders,
const size_t mbSize,
@ -386,7 +385,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const auto & decoderEvaluationNodes = decoderNet->EvaluationNodes();
ElemType evalResults = 0;
double evalResults = 0;
vector<std::map<std::wstring, Matrix<ElemType>*>*> inputMatrices;
for (auto ptr = nets.begin(); ptr != nets.end(); ptr++)
@ -412,7 +411,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; //MBs run before this display
ElemType evalResultsLastMBs = (ElemType)0;
double evalResultsLastMBs = (double)0;
for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++)
{
@ -440,7 +439,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto ptr = nets.begin(); ptr != nets.end(); ptr++)
{
const auto & featNodes = (*ptr)->FeatureNodes();
UpdateEvalTimeStamps(featNodes);
ComputationNetwork::UpdateEvalTimeStamps(featNodes);
}
auto preader = dataReaders.begin();
@ -481,7 +480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((*ptr)->GetNumRows() != 1 || (*ptr)->GetNumCols() != 1)
LogicError("EvaluateEncoderDecoderWithHiddenStates: decoder evaluation should return a scalar value");
evalResults += (ElemType)(*ptr)->Get00Element();
evalResults += (double)(*ptr)->Get00Element();
}
totalEpochSamples += actualMBSize;
@ -575,7 +574,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
IDataWriter<ElemType>& dataWriter,
const vector<wstring>& evalNodeNames,
const vector<wstring>& writeNodeNames,
const size_t mbSize, const ElemType beam, const size_t testSize)
const size_t mbSize, const double beam, const size_t testSize)
{
size_t iNumNets = nets.size();
if (iNumNets < 2)
@ -655,7 +654,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
/// only on the encoder part of the networks
const auto & featNodes = (*ptr)->FeatureNodes();
UpdateEvalTimeStamps(featNodes);
ComputationNetwork::UpdateEvalTimeStamps(featNodes);
}
@ -695,30 +694,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
bool GetCandidatesAtOneTimeInstance(const Matrix<ElemType>& score,
const ElemType & preScore, const ElemType & threshold,
const ElemType& best_score_so_far,
vector<pair<int, ElemType>>& rCandidate)
const double & preScore, const double & threshold,
const double& best_score_so_far,
vector<pair<int, double>>& rCandidate)
{
Matrix<ElemType> ptrScore(CPUDEVICE);
ptrScore = score;
ElemType *pPointer = ptrScore.BufferPointer();
vector<pair<int, ElemType>> tPairs;
vector<pair<int, double>> tPairs;
for (int i = 0; i < ptrScore.GetNumElements(); i++)
{
tPairs.push_back(make_pair(i, pPointer[i]));
// assert(pPointer[i] <= 1.0); /// work on the posterior probabilty, so every score should be smaller than 1.0
}
std::sort(tPairs.begin(), tPairs.end(), comparator<ElemType>);
std::sort(tPairs.begin(), tPairs.end(), comparator<double>);
bool bAboveThreshold = false;
for (typename vector<pair<int, ElemType>>::iterator itr = tPairs.begin(); itr != tPairs.end(); itr++)
for (typename vector<pair<int, double>>::iterator itr = tPairs.begin(); itr != tPairs.end(); itr++)
{
if (itr->second < 0.0)
LogicError("This means to use probability so the value should be non-negative");
ElemType dScore = (itr->second >(ElemType)EPS_IN_LOG) ? log(itr->second) : (ElemType)LOG_OF_EPS_IN_LOG;
double dScore = (itr->second >(double)EPS_IN_LOG) ? log(itr->second) : (double)LOG_OF_EPS_IN_LOG;
dScore += preScore;
if (dScore >= threshold && dScore >= best_score_so_far)
@ -770,7 +769,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return false;
}
UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
size_t actualMBSize = net.GetActualMBSize();
net.SetActualMiniBatchSize(actualMBSize);
@ -809,7 +808,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
dataWriter.SaveData(nidx, outputMatrices, bSize, bSize, 0);
}
void BeamSearch(IDataReader<ElemType>* dataReader, IDataWriter<ElemType>& dataWriter, const vector<wstring>& outputNodeNames, const vector<wstring>& writeNodeNames, const size_t mbSize, const ElemType beam, const size_t testSize)
void BeamSearch(IDataReader<ElemType>* dataReader, IDataWriter<ElemType>& dataWriter, const vector<wstring>& outputNodeNames, const vector<wstring>& writeNodeNames, const size_t mbSize, const double beam, const size_t testSize)
{
clock_t startReadMBTime = 0, endComputeMBTime = 0;
@ -842,10 +841,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
startReadMBTime = clock();
size_t numMBsRun = 0;
ElemType ComputeTimeInMBs = 0;
double ComputeTimeInMBs = 0;
while (dataReader->GetMinibatch(inputMatrices))
{
UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
actualMBSize = m_net.GetActualMBSize();
m_net.SetActualMiniBatchSize(actualMBSize);
@ -868,7 +867,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_traceLevel > 0)
{
ElemType MBComputeTime = (ElemType)(endComputeMBTime - startReadMBTime) / CLOCKS_PER_SEC;
double MBComputeTime = (double)(endComputeMBTime - startReadMBTime) / CLOCKS_PER_SEC;
ComputeTimeInMBs += MBComputeTime;
@ -886,7 +885,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const std::vector<ComputationNodeBasePtr>& evalNodes,
const std::vector<ComputationNodeBasePtr>& outputNodes,
/*const*/ std::vector<ComputationNodeBasePtr>& featureNodes,
const ElemType beam,
const double beam,
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
vector<size_t> &best_path)
{
@ -902,7 +901,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
evaluate their scores, save their histories
*/
priority_queue<Token<ElemType>> from_queue, to_queue;
vector<ElemType> evalResults;
vector<double> evalResults;
size_t mbSize;
mbSize = evalnet->GetActualMBSize();
@ -938,7 +937,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (itdx = 0; itdx < maxSize; itdx++)
{
ElemType best_score = -numeric_limits<ElemType>::infinity();
double best_score = -numeric_limits<double>::infinity();
vector<size_t> best_output_label;
if (itdx > 0)
@ -954,7 +953,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<size_t> history = from_token.sequence;
/// update feature nodes once, as the observation is the same for all propsoals in labels
UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
/// history is updated in the getproposalobs function
dataReader->GetProposalObs(inputMatrices, itdx, history);
@ -966,13 +965,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int i = 0; i < evalNodes.size(); i++)
{
evalnet->Evaluate(evalNodes[i]);
vector<pair<int, ElemType>> retPair;
if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast<ComputationNode<ElemType>>(evalNodes[i])->FunctionValues(), from_token.score, best_score - beam, -numeric_limits<ElemType>::infinity(), retPair)
vector<pair<int, double>> retPair;
if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast<ComputationNode<ElemType>>(evalNodes[i])->FunctionValues(), from_token.score, best_score - beam, -numeric_limits<double>::infinity(), retPair)
== false)
continue;
evalnet->GetHistory(state.hidden_activity, true);
for (typename vector<pair<int, ElemType>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
for (typename vector<pair<int, double>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
{
vector<size_t> history = from_token.sequence;
history.push_back(itr->first);
@ -997,7 +996,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
break;
// beam pruning
const ElemType threshold = best_score - beam;
const double threshold = best_score - beam;
while (!to_queue.empty())
{
if (to_queue.top().score >= threshold)
@ -1036,14 +1035,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/**
beam search decoder
*/
ElemType FindBestPathWithVariableLength(ComputationNetwork* evalnet,
double FindBestPathWithVariableLength(ComputationNetwork* evalnet,
size_t inputLength,
IDataReader<ElemType>* dataReader,
IDataWriter<ElemType>& dataWriter,
std::vector<ComputationNodeBasePtr>& evalNodes,
std::vector<ComputationNodeBasePtr>& outputNodes,
std::vector<ComputationNodeBasePtr>& featureNodes,
const ElemType beam,
const double beam,
std::map<std::wstring, Matrix<ElemType>*> * inputMatrices,
vector<size_t> &best_path)
{
@ -1060,7 +1059,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
*/
std::priority_queue<Token<ElemType>> from_queue, to_queue;
std::priority_queue<Token<ElemType>> result_queue;
vector<ElemType> evalResults;
vector<double> evalResults;
size_t mbSize = inputLength;
size_t maxMbSize = 3 * mbSize;
@ -1093,14 +1092,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/// is the begining of sentence
evalnet->SetActualMiniBatchSize(dataReader->NumberSlicesInEachRecurrentIter());
ElemType best_score = -numeric_limits<ElemType>::infinity();
ElemType best_score_so_far = -numeric_limits<ElemType>::infinity();
double best_score = -numeric_limits<double>::infinity();
double best_score_so_far = -numeric_limits<double>::infinity();
evalnet->SentenceBoundary().SetValue(SEQUENCE_START);
for (itdx = 0; itdx < maxMbSize; itdx++)
{
ElemType best_score = -numeric_limits<ElemType>::infinity();
double best_score = -numeric_limits<double>::infinity();
vector<size_t> best_output_label;
if (itdx > 0)
@ -1116,7 +1115,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<size_t> history = from_token.sequence;
/// update feature nodes once, as the observation is the same for all propsoals in labels
UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
/// history is updated in the getproposalobs function
dataReader->GetProposalObs(inputMatrices, itdx, history);
@ -1128,14 +1127,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int i = 0; i < evalNodes.size(); i++)
{
evalnet->Evaluate(evalNodes[i]);
vector<pair<int, ElemType>> retPair;
vector<pair<int, double>> retPair;
if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast<ComputationNode<ElemType>>(evalNodes[i])->FunctionValues(),
from_token.score, best_score - beam, -numeric_limits<ElemType>::infinity(), retPair)
from_token.score, best_score - beam, -numeric_limits<double>::infinity(), retPair)
== false) // ==false??? !(.)?
continue;
evalnet->GetHistory(state.hidden_activity, true);
for (typename vector<pair<int, ElemType>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
for (typename vector<pair<int, double>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
{
vector<size_t> history = from_token.sequence;
history.push_back(itr->first);
@ -1169,7 +1168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
break;
// beam pruning
const ElemType threshold = best_score - beam;
const double threshold = best_score - beam;
while (!to_queue.empty())
{
if (to_queue.top().score >= threshold)
@ -1189,7 +1188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(best_path.empty());
best_path.swap(const_cast<vector<size_t>&>(result_queue.top().sequence));
{
ElemType score = result_queue.top().score;
double score = result_queue.top().score;
best_score = score;
fprintf(stderr, "best[%zd] score = %.4e\t", ibest, score);
if (best_path.size() > 0)

Просмотреть файл

@ -6,7 +6,6 @@
#pragma once
#include "ComputationNetwork.h"
#include "ComputationNetworkHelper.h"
#include "DataReader.h"
#include <vector>
#include <string>
@ -20,19 +19,15 @@ using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class SimpleOutputWriter : ComputationNetworkHelper<ElemType>
class SimpleOutputWriter
{
typedef ComputationNetworkHelper<ElemType> B;
using B::UpdateEvalTimeStamps;
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
public:
SimpleOutputWriter(ComputationNetwork & net, int verbosity=0)
: m_net(net), m_verbosity(verbosity)
{
}
SimpleOutputWriter(ComputationNetwork & net, int verbosity = 0) :
m_net(net), m_verbosity(verbosity)
{ }
void WriteOutput(IDataReader<ElemType>& dataReader, size_t mbSize, IDataWriter<ElemType>& dataWriter, const std::vector<std::wstring>& outputNodeNames, size_t numOutputSamples=requestDataSize, bool doUnitTest = false)
{
@ -74,8 +69,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (dataReader.GetMinibatch(inputMatrices))
{
UpdateEvalTimeStamps(featureNodes);
UpdateEvalTimeStamps(labelNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
size_t actualMBSize = m_net.GetActualMBSize();
m_net.SetActualMiniBatchSize(actualMBSize);
@ -157,7 +152,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (dataReader.GetMinibatch(inputMatrices))
{
UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
size_t actualMBSize = m_net.GetActualMBSize();
m_net.SetActualMiniBatchSize(actualMBSize);

Просмотреть файл

@ -0,0 +1,13 @@
//
// <copyright file="stdafx.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// stdafx.cpp : source file that includes just the standard includes
// cn.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -0,0 +1,20 @@
//
// <copyright file="stdafx.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#ifdef _WIN32
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms
#include "targetver.h"
#endif
#include <stdio.h>
// TODO: reference additional headers your program requires here

Просмотреть файл

@ -0,0 +1,13 @@
//
// <copyright file="targetver.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -50,7 +50,7 @@ endif
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
CXX = mpic++
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK BrainScript
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib BrainScript
CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror
LIBPATH:=
@ -355,15 +355,17 @@ endif
CNTK_SRC =\
MachineLearning/CNTK/CNTK.cpp \
MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp \
MachineLearning/CNTK/NetworkDescriptionLanguage.cpp \
MachineLearning/CNTK/Profiler.cpp \
MachineLearning/CNTK/ComputationNetwork.cpp \
MachineLearning/CNTK/ComputationNetworkBuilder.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
MachineLearning/CNTK/SynchronousExecutionEngine.cpp \
MachineLearning/CNTK/tests.cpp \
MachineLearning/CNTKComputationNetworkLib/ComputationNode.cpp \
MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp \
MachineLearning/CNTKComputationNetworkLib/ComputationNetworkBuilder.cpp \
MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp \
MachineLearning/CNTKSGDLib/Profiler.cpp \
MachineLearning/CNTKSGDLib/SGD.cpp \
MachineLearning/CNTKEval/CNTKEval.cpp \
BrainScript/BrainScriptEvaluator.cpp \
BrainScript/BrainScriptParser.cpp \

Просмотреть файл

@ -6,6 +6,11 @@
#include "stdafx.h"
#include "CppUnitTest.h"
#include "..\Math\Matrix.h"
#include "..\Math\CPUMatrix.h"
#include "..\Math\GPUMatrix.h"
#include "..\Math\CPUSparseMatrix.h"
#include "..\Math\GPUSparseMatrix.h"
#include "..\Math\Helpers.h"
#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project

Просмотреть файл

@ -6,6 +6,11 @@
#include "stdafx.h"
#include "CppUnitTest.h"
#include "..\Math\Matrix.h"
#include "..\Math\CPUMatrix.h"
#include "..\Math\GPUMatrix.h"
#include "..\Math\CPUSparseMatrix.h"
#include "..\Math\GPUSparseMatrix.h"
#include "..\Math\Helpers.h"
#define epsilon 0.000001
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing

Просмотреть файл

@ -4,14 +4,19 @@
// </copyright>
//
#include "stdafx.h"
#include <string>
#include "..\..\common\include\Basics.h"
#include "CppUnitTest.h"
#include "..\Math\Matrix.h"
#include "..\..\common\include\Basics.h"
#include "..\Math\CPUMatrix.h"
#include "..\Math\GPUMatrix.h"
#include "..\Math\CPUSparseMatrix.h"
#include "..\Math\GPUSparseMatrix.h"
#include "..\Math\Helpers.h"
#include "..\..\common\include\fileutil.h"
#include "..\..\common\include\File.h"
#include "..\..\common\File.cpp"
#include "..\..\common\fileutil.cpp"
#include <string>

Просмотреть файл

@ -6,6 +6,11 @@
#include "stdafx.h"
#include "CppUnitTest.h"
#include "..\Math\Matrix.h"
#include "..\Math\CPUMatrix.h"
#include "..\Math\GPUMatrix.h"
#include "..\Math\CPUSparseMatrix.h"
#include "..\Math\GPUSparseMatrix.h"
#include "..\Math\Helpers.h"
#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project

Просмотреть файл

@ -4,28 +4,16 @@
// </copyright>
//
#pragma once
#include "Basics.h" // for RuntimeError()
#include "Matrix.h"
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include <vector>
#include <stdio.h>
#include <ctime>
#include <limits.h>
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include "Basics.h" // for RuntimeError()
#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#else // no DLLs on Linux
#define MATH_API
#endif
#ifndef USE_TIME_BASED_SEED
#define USE_TIME_BASED_SEED ULONG_MAX
#endif
// NOTE NOTE NOTE:
// use CPUSingleMatrix and CPUDoubleMatrix instead of using the template directly
///////////////////////////////////////////////

Просмотреть файл

@ -1036,7 +1036,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return sum;
}
template <class ElemType>
template <typename ElemType>
MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
{
stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
@ -1090,7 +1090,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
template <class ElemType>
template <typename ElemType>
MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
{
if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)

Просмотреть файл

@ -3,7 +3,10 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
//helpful macros
// TODO: the file's name is too general to be included from outside; MathHelpers.h?
//iterators
#pragma once
#undef foreach_row

Просмотреть файл

@ -22,7 +22,7 @@
</SccLocalPath>
<SccProvider>
</SccProvider>
<ProjectName>CNTKMath</ProjectName>
<ProjectName>CNTKMathDll</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">

Просмотреть файл

@ -6,8 +6,12 @@
//
#include "stdafx.h"
#include "Basics.h"
#include "fileutil.h"
#include "Matrix.h"
#include "CPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include "fileutil.h"
#include <assert.h>
#include <math.h>
#include "GPUWatcher.h" // bring in this class as well so that it gets exported from this DLL
@ -164,7 +168,6 @@
namespace Microsoft { namespace MSR { namespace CNTK {
#pragma region Constructors, destructors and other static matrix builders
//This function will only initialize default bland matrix. The actual matrices need to allocated
//after calling this function and flags need to set correctly by calling SetDataLocation.
template<class ElemType>
@ -563,6 +566,65 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return c;
}
template<class ElemType>
void Matrix<ElemType>::Read(File& stream)
{
Matrix<ElemType>& M = *this;
char type;
stream >> type;
if (type == 'd')
{
if (M.GetDeviceId()<0)
{
if (M.m_CPUMatrix == NULL) M.m_CPUMatrix = new CPUMatrix<ElemType>();
stream >> (*M.m_CPUMatrix);
M.SetDataLocation(CPU, DENSE);
}
else
{
if (M.m_GPUMatrix == NULL) M.m_GPUMatrix = new GPUMatrix<ElemType>();
stream >> (*M.m_GPUMatrix);
M.SetDataLocation(GPU, DENSE);
}
}
else if (type == 's')
{
if (M.GetDeviceId()<0)
{
NOT_IMPLEMENTED;//You might want to tranfer your matrix to GPU
}
else
{
if (M.m_GPUSparseMatrix == NULL) M.m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>();
stream >> (*M.m_GPUSparseMatrix);
M.SetDataLocation(GPU, SPARSE);
}
}
else
LogicError("wrong matrix type!");
}
template<class ElemType>
void Matrix<ElemType>::Write(File& stream) const
{
const Matrix<ElemType>& M = *this;
if (M.GetMatrixType() == MatrixType::DENSE)
{
stream << 'd';
if (M.GetDeviceId() < 0)
stream << (*M.m_CPUMatrix);
else
stream << (*M.m_GPUMatrix);
}
else
{
stream << 's';
if (M.GetDeviceId() < 0)
NOT_IMPLEMENTED //stream<<(*M.m_CPUMatrix);
else
stream << (*M.m_GPUSparseMatrix);
}
}
#pragma endregion Constructors, destructors and other static matrix builders
@ -4740,7 +4802,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class Matrix<double>;
// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explciitly instantiate the methods we need for that purpose
// Let's explicitly instantiate the methods we need for that purpose
template Matrix<char>::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat);
template Matrix<char>::Matrix(const size_t numRows, const size_t numCols, char *pArray, const size_t matrixFlags, DEVICEID_TYPE deviceId, const size_t nnz);
template Matrix<char>::~Matrix();

Просмотреть файл

@ -6,10 +6,20 @@
#pragma once
#include "CPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#else // no DLLs on Linux
#define MATH_API
#endif
#include "Basics.h"
#include "File.h"
#include "CommonMatrix.h"
#include <limits.h>
// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {
@ -55,12 +65,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UNDETERMINED, DENSE, SPARSE
};
// TODO: create an <ElemType>-agnostic base class, then move generic functions such as getting dims, resizing, and getting/setting as scalars
class MATH_API MatrixBase
{
protected:
//virtual ~MatrixBase() { };
// TODO: currently this causes link errors when building DLLs
};
// avoid pulling in these header files for consumers of this class
template<class ElemType> class GPUMatrix;
template<class ElemType> class CPUMatrix;
template<class ElemType> class GPUSparseMatrix;
template<class ElemType> class CPUSparseMatrix;
template<class ElemType> class DeviceBoundNumber;
//To compy with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
//convertion is need when passing data between Matrix and C++ matrices
//For the best performance compile CNTKMath project with NO_SYNC preprocessor directive
//!!!WARNING!!! This class is NOT THREAD SAFE. Test and add necessary modifications if using in multi-threaded environment
template<class ElemType>
class MATH_API Matrix
class MATH_API Matrix : public MatrixBase
{
private:
mutable BaseMatrix<ElemType> *m_baseMatrix;
@ -104,6 +129,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static Matrix<ElemType> Ones(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
static Matrix<ElemType> Zeros(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
static Matrix<ElemType> Eye(const size_t rows, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
#define USE_TIME_BASED_SEED ULONG_MAX
static Matrix<ElemType> RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED, DEVICEID_TYPE deviceId = AUTOPLACEMATRIX);
static Matrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
@ -174,6 +201,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ShiftBy(int numShift);
// TODO: all these scalars should be passed as doubles and cast down inside
void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
ElemType Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier);
ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
@ -437,76 +465,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool HasElement(const Matrix<ElemType>& a, const ElemType value = 0.0);
public:
friend File& operator>>(File& stream, Matrix<ElemType>& M)
{
char type;
stream>>type;
if (type=='d')
{
if (M.GetDeviceId()<0)
{
if (M.m_CPUMatrix==NULL) M.m_CPUMatrix = new CPUMatrix<ElemType>();
stream>>(*M.m_CPUMatrix);
M.SetDataLocation(CPU, DENSE);
}
else
{
if (M.m_GPUMatrix==NULL) M.m_GPUMatrix = new GPUMatrix<ElemType>();
stream>>(*M.m_GPUMatrix);
M.SetDataLocation(GPU, DENSE);
}
}
else if (type=='s')
{
if (M.GetDeviceId()<0)
{
NOT_IMPLEMENTED;//You might want to tranfer your matrix to GPU
}
else
{
if (M.m_GPUSparseMatrix==NULL) M.m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>();
stream>>(*M.m_GPUSparseMatrix);
M.SetDataLocation(GPU, SPARSE);
}
}
else
LogicError("wrong matrix type!");
return stream;
void Read(File& stream);
void Write(File& stream) const;
}
friend File& operator<<(File& stream, const Matrix<ElemType>& M)
{
if (M.GetMatrixType()==MatrixType::DENSE)
{
stream<<'d';
if (M.GetDeviceId()<0)
{
stream<<(*M.m_CPUMatrix);
}
else
{
stream<<(*M.m_GPUMatrix);
}
}
else
{
stream<<'s';
if (M.GetDeviceId()<0)
{
NOT_IMPLEMENTED;
//stream<<(*M.m_CPUMatrix);
}
else
{
stream<<(*M.m_GPUSparseMatrix);
}
}
return stream;
}
public:
public:
Matrix<ElemType>& Shift(const Matrix<ElemType>& a, int shift);
Matrix<ElemType>& AssignElementProductOfWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, size_t shift, size_t negnumber);
@ -536,6 +497,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
friend class QuantizedMatrix;
};
// overload I/O operators
template<class ElemType>
File& operator>>(File& stream, Matrix<ElemType>& M) { M.Read(stream); return stream; }
template<class ElemType>
File& operator<<(File& stream, const Matrix<ElemType>& M) { M.Write(stream); return stream; }
typedef Matrix<float> SingleMatrix;
typedef Matrix<double> DoubleMatrix;
}}}

Просмотреть файл

@ -359,7 +359,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class GPUSparseMatrix<float>;
template class GPUSparseMatrix<double>;
template <class ElemType>
template <typename ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
{
return stream;
@ -368,7 +368,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template MATH_API File& operator>>(File& stream, GPUSparseMatrix<float>& us);
template MATH_API File& operator>>(File& stream, GPUSparseMatrix<double>& us);
template <class ElemType>
template <typename ElemType>
MATH_API File& operator<<(File& stream, const GPUSparseMatrix<ElemType>& us)
{
return stream;

Просмотреть файл

@ -13,16 +13,16 @@ testCases:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}}]
- TrainLossPerSample = {{float,tolerance=0.001%}}
- EvalErrPerSample = {{float,tolerance=0%}}
- Ave LearnRatePerSample = {{float,tolerance=0%}}
- EvalErrPerSample = {{float,tolerance=0.01%}}
- Ave LearnRatePerSample = {{float,tolerance=0.01%}}
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} of {{integer}}]
- SamplesSeen = {{integer}}
- TrainLossPerSample = {{float,tolerance=0.001%}}
- EvalErr[0]PerSample = {{float,tolerance=0%}}
- TrainLossPerSample = {{float,tolerance=0.1%}}
- EvalErr[0]PerSample = {{float,tolerance=0.01%}}
DataParallelSGD training parameters must match for each MPI Rank:
patterns:

Просмотреть файл

@ -16,6 +16,9 @@ Command lines for debugging
WORKING DIR: $(SolutionDir)Tests\Speech\Data
COMMAND: configFile=$(SolutionDir)Tests\Speech\QuickE2E\cntk.config stderr=$(SolutionDir)Tests\Speech\RunDir\QuickE2E\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\Speech\RunDir\QuickE2E DataDir=$(SolutionDir)Tests\Speech\Data DeviceId=Auto
Linux:
bin/cntk configFile=Tests/Speech/QuickE2E/cntk.config RunDir=Tests/Speech/RunDirL/QuickE2E DataDir=Tests/Speech/Data DeviceId=0
# TODO: can stderr refer to RunDir?
--- LSTM: