Conflicts:
	MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
	MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
	MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h
	MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
	Tests/Speech/LSTM/cntk.config
	configure
This commit is contained in:
Amit Agarwal 2015-10-15 21:08:10 -07:00
Родитель c9f8e73dc4 96cb4b9872
Коммит 35b0e71b5a
53 изменённых файлов: 2031 добавлений и 468 удалений

1
.gitattributes поставляемый
Просмотреть файл

@ -1 +1,2 @@
run-test text eol=lf
run-test-common text eol=lf

100
CNTK.sln
Просмотреть файл

@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.21005.1
VisualStudioVersion = 12.0.40629.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathDll", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
ProjectSection(ProjectDependencies) = postProject
@ -167,6 +167,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CheckInSuites", "CheckInSui
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SparsePCReader", "DataReader\SparsePCReader\SparsePCReader.vcxproj", "{CE429AA2-3778-4619-8FD1-49BA3B81197B}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Speech", "Speech", "{C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}"
ProjectSection(SolutionItems) = preProject
@ -201,6 +204,11 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{19EE975B-2
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ParseConfig", "MachineLearning\ParseConfig\ParseConfig.vcxproj", "{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageReader", "DataReader\ImageReader\ImageReader.vcxproj", "{9BD0A746-0BBD-45B6-B81C-053F03C26CFB}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKComputationNetworkLib", "MachineLearning\CNTKComputationNetworkLib\CNTKComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
@ -349,6 +357,50 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "FullUtterance", "FullUttera
Tests\Speech\LSTM\FullUtterance\testcases.yml = Tests\Speech\LSTM\FullUtterance\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DNN", "DNN", "{6994C86D-A672-4254-824A-51F4DFEB807F}"
ProjectSection(SolutionItems) = preProject
Tests\Speech\DNN\cntk.config = Tests\Speech\DNN\cntk.config
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Parallel1BitQuantization", "Parallel1BitQuantization", "{FD88A77E-A839-4513-AC5C-AD92447BE229}"
ProjectSection(SolutionItems) = preProject
Tests\Speech\DNN\Parallel1BitQuantization\baseline.cpu.txt = Tests\Speech\DNN\Parallel1BitQuantization\baseline.cpu.txt
Tests\Speech\DNN\Parallel1BitQuantization\baseline.gpu.txt = Tests\Speech\DNN\Parallel1BitQuantization\baseline.gpu.txt
Tests\Speech\DNN\Parallel1BitQuantization\baseline.windows.cpu.txt = Tests\Speech\DNN\Parallel1BitQuantization\baseline.windows.cpu.txt
Tests\Speech\DNN\Parallel1BitQuantization\baseline.windows.gpu.txt = Tests\Speech\DNN\Parallel1BitQuantization\baseline.windows.gpu.txt
Tests\Speech\DNN\Parallel1BitQuantization\run-test = Tests\Speech\DNN\Parallel1BitQuantization\run-test
Tests\Speech\DNN\Parallel1BitQuantization\testcases.yml = Tests\Speech\DNN\Parallel1BitQuantization\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelNoQuantization", "ParallelNoQuantization", "{40F93928-5CA2-433A-A48C-C8E9A35D7079}"
ProjectSection(SolutionItems) = preProject
Tests\Speech\DNN\ParallelNoQuantization\baseline.cpu.txt = Tests\Speech\DNN\ParallelNoQuantization\baseline.cpu.txt
Tests\Speech\DNN\ParallelNoQuantization\baseline.gpu.txt = Tests\Speech\DNN\ParallelNoQuantization\baseline.gpu.txt
Tests\Speech\DNN\ParallelNoQuantization\baseline.windows.cpu.txt = Tests\Speech\DNN\ParallelNoQuantization\baseline.windows.cpu.txt
Tests\Speech\DNN\ParallelNoQuantization\baseline.windows.gpu.txt = Tests\Speech\DNN\ParallelNoQuantization\baseline.windows.gpu.txt
Tests\Speech\DNN\ParallelNoQuantization\run-test = Tests\Speech\DNN\ParallelNoQuantization\run-test
Tests\Speech\DNN\ParallelNoQuantization\testcases.yml = Tests\Speech\DNN\ParallelNoQuantization\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DiscriminativePreTraining", "DiscriminativePreTraining", "{39B9BB97-D0E8-439A-8A1B-8DB8E7CF73C3}"
ProjectSection(SolutionItems) = preProject
Tests\Speech\DNN\DiscriminativePreTraining\baseline.cpu.txt = Tests\Speech\DNN\DiscriminativePreTraining\baseline.cpu.txt
Tests\Speech\DNN\DiscriminativePreTraining\baseline.gpu.txt = Tests\Speech\DNN\DiscriminativePreTraining\baseline.gpu.txt
Tests\Speech\DNN\DiscriminativePreTraining\baseline.windows.cpu.txt = Tests\Speech\DNN\DiscriminativePreTraining\baseline.windows.cpu.txt
Tests\Speech\DNN\DiscriminativePreTraining\baseline.windows.gpu.txt = Tests\Speech\DNN\DiscriminativePreTraining\baseline.windows.gpu.txt
Tests\Speech\DNN\DiscriminativePreTraining\cntk_dpt.config = Tests\Speech\DNN\DiscriminativePreTraining\cntk_dpt.config
Tests\Speech\DNN\DiscriminativePreTraining\run-test = Tests\Speech\DNN\DiscriminativePreTraining\run-test
Tests\Speech\DNN\DiscriminativePreTraining\testcases.yml = Tests\Speech\DNN\DiscriminativePreTraining\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ndl", "ndl", "{09B8623A-BF5D-4499-B3B3-A9EFEA4C4B68}"
ProjectSection(SolutionItems) = preProject
Tests\Speech\Data\ndl\add_layer.mel = Tests\Speech\Data\ndl\add_layer.mel
Tests\Speech\Data\ndl\dnn.txt = Tests\Speech\Data\ndl\dnn.txt
Tests\Speech\Data\ndl\dnn_1layer.txt = Tests\Speech\Data\ndl\dnn_1layer.txt
Tests\Speech\Data\ndl\macros.txt = Tests\Speech\Data\ndl\macros.txt
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
@ -432,51 +484,61 @@ Global
{EAD17188-072C-4726-B840-A769C36DAD1B}.Debug|x64.Build.0 = Debug|x64
{EAD17188-072C-4726-B840-A769C36DAD1B}.Release|x64.ActiveCfg = Release|x64
{EAD17188-072C-4726-B840-A769C36DAD1B}.Release|x64.Build.0 = Release|x64
{9BD0A746-0BBD-45B6-B81C-053F03C26CFB}.Debug|x64.ActiveCfg = Debug|x64
{9BD0A746-0BBD-45B6-B81C-053F03C26CFB}.Debug|x64.Build.0 = Debug|x64
{9BD0A746-0BBD-45B6-B81C-053F03C26CFB}.Release|x64.ActiveCfg = Release|x64
{9BD0A746-0BBD-45B6-B81C-053F03C26CFB}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{6CEE834A-8104-46A8-8902-64C81BD7928F} = {D45DF403-6781-444E-B654-A96868C5BE68}
{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {D45DF403-6781-444E-B654-A96868C5BE68}
{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68}
{DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68}
{C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} = {D45DF403-6781-444E-B654-A96868C5BE68}
{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B} = {D45DF403-6781-444E-B654-A96868C5BE68}
{5E666C53-2D82-49C9-9127-3FDDC321C741} = {D45DF403-6781-444E-B654-A96868C5BE68}
{E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68}
{B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{065AF55D-AF02-448B-BFCD-52619FDA4BD0} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{3ED0465D-23E7-4855-9694-F788717B6533} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{3E9C89B1-C045-4F42-92B2-F9FFFFC2DBD4} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{C70E1572-20FF-496C-A0A9-10AA6755A07C} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{065AF55D-AF02-448B-BFCD-52619FDA4BD0} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{98D2C32B-0C1F-4E19-A626-65F7BA4600CF} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
{EA67F51F-1FE8-462D-9F3E-01161685AD59} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
{DE1A06BA-EC5C-4E0D-BCA8-3EA555310C58} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
{63024704-A2D7-497E-AD4B-5C10C6AA1374} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
{F9BEB27E-8AF5-464E-8D45-0000D5AFA2D3} = {EA67F51F-1FE8-462D-9F3E-01161685AD59}
{889C1CCF-92B3-450B-B00D-FC9A9D5BE464} = {EA67F51F-1FE8-462D-9F3E-01161685AD59}
{DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68}
{CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} = {D45DF403-6781-444E-B654-A96868C5BE68}
{4BBF2950-3DBD-469A-AD57-6CACBEBAF541} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{5F733BBA-FE83-4668-8F83-8B0E78A36619} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{19EE975B-232D-49F0-94C7-6F1C6424FB53} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{88F85A64-105D-4CDA-8199-B7A312FC8A27} = {19EE975B-232D-49F0-94C7-6F1C6424FB53}
{8241108A-7824-4FF2-BECA-7521A9D89DCF} = {19EE975B-232D-49F0-94C7-6F1C6424FB53}
{7C4E77C9-6B17-4B02-82C1-DB62EEE2635B} = {D45DF403-6781-444E-B654-A96868C5BE68}
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{5E666C53-2D82-49C9-9127-3FDDC321C741} = {D45DF403-6781-444E-B654-A96868C5BE68}
{6D1353D6-F196-466F-B886-F16D48759B20} = {5E666C53-2D82-49C9-9127-3FDDC321C741}
{B6725C9F-A6D2-4269-9B74-7888A90F7884} = {5E666C53-2D82-49C9-9127-3FDDC321C741}
{B27DD434-EECD-4EE0-A03B-1150EB87258E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884}
{A4884465-CFBB-4A64-A9DE-690E1A63EF7E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884}
{9BD0A746-0BBD-45B6-B81C-053F03C26CFB} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{3E9C89B1-C045-4F42-92B2-F9FFFFC2DBD4} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{C70E1572-20FF-496C-A0A9-10AA6755A07C} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{88F85A64-105D-4CDA-8199-B7A312FC8A27} = {19EE975B-232D-49F0-94C7-6F1C6424FB53}
{8241108A-7824-4FF2-BECA-7521A9D89DCF} = {19EE975B-232D-49F0-94C7-6F1C6424FB53}
{6994C86D-A672-4254-824A-51F4DFEB807F} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{FD88A77E-A839-4513-AC5C-AD92447BE229} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{40F93928-5CA2-433A-A48C-C8E9A35D7079} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{39B9BB97-D0E8-439A-8A1B-8DB8E7CF73C3} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{09B8623A-BF5D-4499-B3B3-A9EFEA4C4B68} = {5F733BBA-FE83-4668-8F83-8B0E78A36619}
EndGlobalSection
EndGlobal

Просмотреть файл

@ -159,9 +159,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// HAH! This function is only ever used for Decimate(). It can completely go away, as can methods of the same name in the readers!
//bool RequireSentenceSeg() const { return m_dataIsSequential; } // this is the name of a function on DataReader which really belongs here
#if 0 // (I thought I need this, but don't. Keeping it anyway, maybe we need it again in the future.)
// compute the number of actual samples in this layout (not counting NoLabel ones)
// This is only expensive for a weirdo configuration of multiple variable-length sequences that still normalizes the gradient over the total # seen samples.
// This is used by MeanNode and InvStdDevNode.
size_t DetermineActualNumSamples() const
{
size_t n = GetNumTimeSteps() * GetNumParallelSequences();
@ -178,7 +177,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
return n;
}
#endif
private:
size_t m_numTimeSteps;

Просмотреть файл

@ -84,10 +84,12 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
#include <locale> // std::wstring_convert
#include <string>
#include <algorithm> // for transform()
#include <mutex>
#include <unordered_map>
#include <chrono>
#include <thread>
#include <stack>
#include <mutex>
#include <memory>
#ifdef _MSC_VER
#include <codecvt> // std::codecvt_utf8
#endif
@ -1004,4 +1006,59 @@ static inline std::wstring FormatWin32Error(DWORD error)
return res;
}
#endif // _WIN32
// Very simple version of a thread-safe stack. Add other functions as needed.
// Used to hand out per-caller resources (e.g. RNG instances) from a shared pool.
template<typename T>
class conc_stack
{
public:
    typedef typename std::stack<T>::value_type value_type;

    conc_stack() {}

    // Pops and returns the top item; when the stack is empty, returns a fresh
    // item obtained from 'factory'. The factory is invoked OUTSIDE the lock so
    // that a slow construction does not block concurrent push/pop callers.
    value_type pop_or_create(std::function<value_type()> factory)
    {
        {
            std::lock_guard<std::mutex> g(m_locker);
            if (!m_stack.empty())
            {
                value_type res = std::move(m_stack.top());
                m_stack.pop();
                return res;
            }
        }
        // Stack was empty at the time of the check; create a new item without
        // holding the mutex. (Another thread pushing meanwhile is harmless: the
        // pool simply grows by one.)
        return factory();
    }

    // Pushes a copy of 'item'.
    void push(const value_type& item)
    {
        std::lock_guard<std::mutex> g(m_locker);
        m_stack.push(item);
    }

    // Pushes 'item' by move.
    void push(value_type&& item)
    {
        std::lock_guard<std::mutex> g(m_locker);
        m_stack.push(std::forward<value_type>(item));
    }

public:
    // Non-copyable, non-movable: the contained mutex pins the object in place.
    conc_stack(const conc_stack&) = delete;
    conc_stack& operator=(const conc_stack&) = delete;
    conc_stack(conc_stack&&) = delete;
    conc_stack& operator=(conc_stack&&) = delete;

private:
    std::stack<value_type> m_stack;  // guarded by m_locker
    std::mutex m_locker;
};
// make_unique was added in GCC 4.9.0 (with C++14 support). Provide a fallback
// for older GCC 4.x only.
// NOTE: the guard must test '__GNUC__ == 4', not '>= 4'. With '>= 4' any newer
// major release whose minor version happens to be below 9 (e.g. GCC 5.4, 6.3)
// would also take this branch and clash with the standard library's own
// std::make_unique, breaking the build.
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 9
namespace std
{
    template<typename T, typename... Args>
    std::unique_ptr<T> make_unique(Args&&... args)
    {
        return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
    }
}
#endif
#endif // _BASETYPES_

Просмотреть файл

@ -494,10 +494,19 @@ public:
// check for custom separator character
// If the opening brace is immediately followed by any of the customSeparators,
// change m_separator (inside seps) to that character.
// TODO: document what this is for, where it is used [fseide]
// The parser lets you change the default separator to something else. For example the default separator for an array is usually the : (I think)
// (12:45:23:46)
// However if you are using strings, and one of those strings contains a :, you might want to change the separator to something else:
// (;this;is;a;path:;c:\mydir\stuff)
//
// This will fail for
// (..\dirname,something else)
// Hence there is an ugly fix for it below. This will go away when we replace all configuration parsing by BrainScript.
const static std::string customSeperators = "`~!@$%^&*_-+|:;,?.";
if (customSeperators.find(stringParse[tokenStart]) != npos)
if (customSeperators.find(stringParse[tokenStart]) != npos
&& stringParse.substr(tokenStart).find("..") != 0 && stringParse.substr(tokenStart).find(".\\") != 0 && stringParse.substr(tokenStart).find("./") != 0 // [fseide] otherwise this will nuke leading . or .. in a pathname... Aargh!
)
{
char separator = stringParse[tokenStart];
// this was m_separator; on content level, we change it to a custom separator (it gets changed back when we exit content level)

Просмотреть файл

@ -0,0 +1,30 @@
//
// <copyright file="Exports.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// Exports.cpp : Defines the exported functions for the DLL application.
//
#include "stdafx.h"
#define DATAREADER_EXPORTS
#include "DataReader.h"
#include "ImageReader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Factory: allocates a new ImageReader of the requested element type.
// The caller takes ownership of the returned object.
template<class ElemType>
void DATAREADER_API GetReader(IDataReader<ElemType>** preader)
{
*preader = new ImageReader<ElemType>();
}
// C-linkage entry points looked up when the reader DLL is loaded at runtime;
// one per element type, since a template cannot cross the C ABI boundary.
extern "C" DATAREADER_API void GetReaderF(IDataReader<float>** preader)
{
GetReader(preader);
}
extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader)
{
GetReader(preader);
}
}}}

Просмотреть файл

@ -0,0 +1,470 @@
//
// <copyright company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#include "stdafx.h"
#define DATAREADER_EXPORTS // creating the exports here
#include <algorithm>
#include <fstream>
#include <sstream>
#include <opencv2/opencv.hpp>
#include "DataReader.h"
#include "ImageReader.h"
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Case-insensitive string equality.
// The length check is required: std::equal with only a begin iterator for the
// second range reads past the end of a shorter s2 (undefined behavior) and
// reports a false match when s2 is longer but shares s1 as a prefix.
// Characters are cast to unsigned char before std::tolower, since passing a
// negative plain char is undefined behavior.
static bool AreEqual(const std::string& s1, const std::string& s2)
{
    return s1.size() == s2.size() &&
        std::equal(s1.begin(), s1.end(), s2.begin(),
            [](char a, char b)
            {
                return std::tolower(static_cast<unsigned char>(a)) ==
                       std::tolower(static_cast<unsigned char>(b));
            });
}
//-------------------
// Transforms
// Interface for image pre-processing transforms. ImageReader applies each
// registered transform to every loaded image, in registration order.
class ITransform
{
public:
// Reads the transform's parameters from the reader's configuration section.
virtual void Init(const ConfigParameters& config) = 0;
// Transforms 'mat' in place.
virtual void Apply(cv::Mat& mat) = 0;
ITransform() {};
virtual ~ITransform() {};
public:
// Non-copyable, non-movable: concrete transforms hold per-instance state
// (e.g. RNG pools) that must not be duplicated.
ITransform(const ITransform&) = delete;
ITransform& operator=(const ITransform&) = delete;
ITransform(ITransform&&) = delete;
ITransform& operator=(ITransform&&) = delete;
};
// Crops a square region from the image: centered or randomly positioned,
// optionally with a randomized crop size ("ratio jitter") and a random
// horizontal flip.
class CropTransform : public ITransform
{
public:
// 'seed' is the base seed for the per-caller RNG pool used in Apply().
CropTransform(unsigned int seed) : m_seed(seed)
{
}
// Reads cropType, cropRatio ("r" or "rmin:rmax"), jitterType and hflip.
// hflip defaults to true exactly when cropType is Random.
void Init(const ConfigParameters& config)
{
m_cropType = ParseCropType(config("cropType", ""));
std::stringstream ss{ config("cropRatio", "1") };
std::string token{ "" };
if (std::getline(ss, token, ':'))
{
m_cropRatioMin = std::stof(token);
// Second value after ':' is optional; default max to min.
m_cropRatioMax = std::getline(ss, token, ':') ? std::stof(token) : m_cropRatioMin;
}
if (!(0 < m_cropRatioMin && m_cropRatioMin <= 1.0) ||
!(0 < m_cropRatioMax && m_cropRatioMax <= 1.0) ||
m_cropRatioMin > m_cropRatioMax)
{
RuntimeError("Invalid cropRatio value, must be > 0 and <= 1. cropMin must <= cropMax");
}
m_jitterType = ParseJitterType(config("jitterType", ""));
if (!config.ExistsCurrent("hflip"))
m_hFlip = m_cropType == CropType::Random;
else
m_hFlip = std::stoi(config("hflip")) != 0;
}
// Crops (and possibly flips) 'mat' in place. Thread-safe: each call borrows
// a private RNG from the pool, creating one from m_seed on first use.
void Apply(cv::Mat& mat)
{
// Local copy so the lambda captures the value, not 'this'.
auto seed = m_seed;
auto rng = m_rngs.pop_or_create([seed]() { return std::make_unique<std::mt19937>(seed); });
double ratio = 1;
switch (m_jitterType)
{
case RatioJitterType::None:
ratio = m_cropRatioMin;
break;
case RatioJitterType::UniRatio:
ratio = UniRealT(m_cropRatioMin, m_cropRatioMax)(*rng);
// NOTE(review): when cropRatioMin == cropRatioMax the distribution returns
// exactly min, and 'ratio < m_cropRatioMax' fails — confirm intended.
assert(m_cropRatioMin <= ratio && ratio < m_cropRatioMax);
break;
default:
// UniLength / UniArea are declared but not implemented yet.
RuntimeError("Jitter type currently not implemented.");
}
mat = mat(GetCropRect(m_cropType, mat.rows, mat.cols, ratio, *rng));
// 50/50 horizontal flip when enabled.
if (m_hFlip && std::bernoulli_distribution()(*rng))
cv::flip(mat, mat, 1);
m_rngs.push(std::move(rng));
}
private:
using UniRealT = std::uniform_real_distribution<double>;
using UniIntT = std::uniform_int_distribution<int>;
enum class CropType { Center = 0, Random = 1 };
// How the crop size is randomized between cropRatioMin and cropRatioMax.
// Only None and UniRatio are handled in Apply().
enum class RatioJitterType
{
None = 0,
UniRatio = 1,
UniLength = 2,
UniArea = 3
};
// Maps the config string to a CropType; empty string means Center.
CropType ParseCropType(const std::string& src)
{
if (src.empty() || AreEqual(src, "center"))
return CropType::Center;
if (AreEqual(src, "random"))
return CropType::Random;
RuntimeError("Invalid crop type: %s.", src.c_str());
}
// Maps the config string to a RatioJitterType; empty string means None.
RatioJitterType ParseJitterType(const std::string& src)
{
if (src.empty() || AreEqual(src, "none"))
return RatioJitterType::None;
if (AreEqual(src, "uniratio"))
return RatioJitterType::UniRatio;
if (AreEqual(src, "unilength"))
return RatioJitterType::UniLength;
if (AreEqual(src, "uniarea"))
return RatioJitterType::UniArea;
RuntimeError("Invalid jitter type: %s.", src.c_str());
}
// Computes the square crop rectangle. cropRatio scales the shorter image
// side; position is centered (Center) or uniformly random (Random).
cv::Rect GetCropRect(CropType type, int crow, int ccol, double cropRatio, std::mt19937& rng)
{
assert(crow > 0);
assert(ccol > 0);
assert(0 < cropRatio && cropRatio <= 1.0);
int cropSize = static_cast<int>(std::min(crow, ccol) * cropRatio);
int xOff = -1;
int yOff = -1;
switch (type)
{
case CropType::Center:
xOff = (ccol - cropSize) / 2;
yOff = (crow - cropSize) / 2;
break;
case CropType::Random:
xOff = UniIntT(0, ccol - cropSize)(rng);
yOff = UniIntT(0, crow - cropSize)(rng);
break;
default:
assert(false);
}
assert(0 <= xOff && xOff <= ccol - cropSize);
assert(0 <= yOff && yOff <= crow - cropSize);
return cv::Rect(xOff, yOff, cropSize, cropSize);
}
private:
unsigned int m_seed;  // base seed for RNGs created into the pool below
conc_stack<std::unique_ptr<std::mt19937>> m_rngs;  // one RNG per concurrent caller
CropType m_cropType;
double m_cropRatioMin;
double m_cropRatioMax;
RatioJitterType m_jitterType;
bool m_hFlip;
};
// Resizes the image to the configured width/height, converting it to the
// floating-point type matching ElemType first (resizing needs float data).
// The interpolation method is picked at random from the configured list.
class ScaleTransform : public ITransform
{
public:
// dataType must be CV_32F or CV_64F; 'seed' seeds the RNG pool used to pick
// an interpolation method per image.
ScaleTransform(int dataType, unsigned int seed) : m_dataType(dataType), m_seed(seed)
{
assert(m_dataType == CV_32F || m_dataType == CV_64F);
m_interpMap.emplace("nearest", cv::INTER_NEAREST);
m_interpMap.emplace("linear", cv::INTER_LINEAR);
m_interpMap.emplace("cubic", cv::INTER_CUBIC);
m_interpMap.emplace("lanczos", cv::INTER_LANCZOS4);
}
// Reads width/height/channels and the ':'-separated "interpolations" list.
// Unknown interpolation names are silently ignored; empty list => linear.
void Init(const ConfigParameters& config)
{
m_imgWidth = config("width");
m_imgHeight = config("height");
m_imgChannels = config("channels");
size_t cfeat = m_imgWidth * m_imgHeight * m_imgChannels;
if (cfeat == 0 || cfeat > std::numeric_limits<size_t>().max() / 2)
RuntimeError("Invalid image dimensions.");
m_interp.clear();
std::stringstream ss{ config("interpolations", "") };
for (std::string token = ""; std::getline(ss, token, ':');)
{
// Explicit cast required for GCC.
std::transform(token.begin(), token.end(), token.begin(), (int (*)(int))std::tolower);
StrToIntMapT::const_iterator res = m_interpMap.find(token);
if (res != m_interpMap.end())
m_interp.push_back((*res).second);
}
if (m_interp.size() == 0)
m_interp.push_back(cv::INTER_LINEAR);
}
// Converts 'mat' to the target float type if needed, then resizes in place
// using a randomly chosen interpolation method. Thread-safe via the RNG pool.
void Apply(cv::Mat& mat)
{
// If matrix has not been converted to the right type, do it now as rescaling requires floating point type.
if (mat.type() != CV_MAKETYPE(m_dataType, m_imgChannels))
mat.convertTo(mat, m_dataType);
// Local copy so the lambda captures the value, not 'this'.
auto seed = m_seed;
auto rng = m_rngs.pop_or_create([seed]() { return std::make_unique<std::mt19937>(seed); });
assert(m_interp.size() > 0);
cv::resize(mat, mat, cv::Size(static_cast<int>(m_imgWidth), static_cast<int>(m_imgHeight)), 0, 0,
m_interp[UniIntT(0, static_cast<int>(m_interp.size()) - 1)(*rng)]);
m_rngs.push(std::move(rng));
}
private:
using UniIntT = std::uniform_int_distribution<int>;
unsigned int m_seed;  // base seed for the RNG pool
conc_stack<std::unique_ptr<std::mt19937>> m_rngs;  // one RNG per concurrent caller
int m_dataType;  // CV_32F or CV_64F, chosen from ElemType by the reader
using StrToIntMapT = std::unordered_map<std::string, int>;
StrToIntMapT m_interpMap;  // config name -> OpenCV interpolation flag
std::vector<int> m_interp;  // allowed interpolation flags (never empty after Init)
size_t m_imgWidth;
size_t m_imgHeight;
size_t m_imgChannels;
};
// Subtracts a precomputed mean image (loaded from an OpenCV FileStorage file)
// from each input image. A missing/empty meanFile makes this a no-op.
class MeanTransform : public ITransform
{
public:
MeanTransform()
{
}
// Loads the mean image from config("meanFile"). The file stores the pixel
// data ("MeanImg") plus its dimensions ("Channel"/"Row"/"Col"), which are
// validated against the data and used to reshape it.
void Init(const ConfigParameters& config)
{
std::wstring meanFile = config(L"meanFile", L"");
if (meanFile.empty())
m_meanImg.release();
else
{
cv::FileStorage fs;
// REVIEW alexeyk: this sort of defeats the purpose of using wstring at all...
auto fname = msra::strfun::utf8(meanFile);
fs.open(fname, cv::FileStorage::READ);
if (!fs.isOpened())
RuntimeError("Could not open file: " + fname);
fs["MeanImg"] >> m_meanImg;
int cchan;
fs["Channel"] >> cchan;
int crow;
fs["Row"] >> crow;
int ccol;
fs["Col"] >> ccol;
if (cchan * crow * ccol != m_meanImg.channels() * m_meanImg.rows * m_meanImg.cols)
RuntimeError("Invalid data in file: " + fname);
fs.release();
m_meanImg = m_meanImg.reshape(cchan, crow);
}
}
// Subtracts the mean in place when a mean image is loaded and its size
// matches; silently does nothing otherwise (the assert documents the
// only expected mismatch: no mean image loaded at all).
void Apply(cv::Mat& mat)
{
assert(m_meanImg.size() == cv::Size(0, 0) || (m_meanImg.size() == mat.size() && m_meanImg.channels() == mat.channels()));
// REVIEW alexeyk: check type conversion (float/double).
if (m_meanImg.size() == mat.size())
mat = mat - m_meanImg;
}
private:
cv::Mat m_meanImg;  // empty => transform disabled
};
//-------------------
// ImageReader
// Constructs the reader with its fixed transform pipeline: crop, then scale,
// then mean subtraction. All transforms share the initial seed 0 until
// SetRandomSeed is called.
template<class ElemType>
ImageReader<ElemType>::ImageReader() : m_seed(0), m_rng(m_seed), m_imgListRand(true), m_pMBLayout(make_shared<MBLayout>())
{
m_transforms.push_back(std::make_unique<CropTransform>(m_seed));
// Pixel type follows ElemType: float -> CV_32F, double -> CV_64F.
m_transforms.push_back(std::make_unique<ScaleTransform>(sizeof(ElemType) == 4 ? CV_32F : CV_64F, m_seed));
m_transforms.push_back(std::make_unique<MeanTransform>());
}
// Out-of-line destructor: required here because the header only forward-
// declares ITransform, so unique_ptr<ITransform> cannot be destroyed there.
template<class ElemType>
ImageReader<ElemType>::~ImageReader()
{
}
// Reads the reader configuration: locates the feature section (the one with a
// "width" parameter) and the label section (the one with "labelDim"), sets up
// the transforms, and loads the image list from the tab-delimited map file
// given by config("file") (columns: image path, class id).
template<class ElemType>
void ImageReader<ElemType>::Init(const ConfigParameters& config)
{
using SectionT = std::pair<std::string, ConfigParameters>;
// Finds the first sub-section containing 'paramName'; errors if none.
// (Name "gettter" is a typo kept as-is; it is local to this function.)
auto gettter = [&](const std::string& paramName) -> SectionT
{
auto sect = std::find_if(config.begin(), config.end(),
[&](const std::pair<std::string, ConfigValue>& p) { return ConfigParameters(p.second).ExistsCurrent(paramName); });
if (sect == config.end())
RuntimeError("ImageReader requires " + paramName + " parameter.");
return{ (*sect).first, ConfigParameters((*sect).second) };
};
// REVIEW alexeyk: currently support only one feature and label section.
SectionT featSect{ gettter("width") };
m_featName = msra::strfun::utf16(featSect.first);
// REVIEW alexeyk: w, h and c will be read again in ScaleTransform.
size_t w = featSect.second("width");
size_t h = featSect.second("height");
size_t c = featSect.second("channels");
m_featDim = w * h * c;
// Initialize transforms.
for (auto& t: m_transforms)
t->Init(featSect.second);
SectionT labSect{ gettter("labelDim") };
m_labName = msra::strfun::utf16(labSect.first);
m_labDim = labSect.second("labelDim");
std::string mapPath = config("file");
std::ifstream mapFile(mapPath);
if (!mapFile)
RuntimeError("Could not open " + mapPath + " for reading.");
std::string line{ "" };
for (size_t cline = 0; std::getline(mapFile, line); cline++)
{
std::stringstream ss{ line };
std::string imgPath;
std::string clsId;
if (!std::getline(ss, imgPath, '\t') || !std::getline(ss, clsId, '\t'))
// NOTE(review): 'cline' is size_t but formatted with %d — may print
// garbage on 64-bit platforms; confirm RuntimeError's format handling.
RuntimeError("Invalid map file format, must contain 2 tab-delimited columns: %s, line: %d.", mapPath.c_str(), cline);
files.push_back({ imgPath, std::stoi(clsId) });
}
// randomize: "auto" (default) shuffles the image list each epoch; "none"
// keeps file order. Any other value is rejected.
std::string rand = config("randomize", "auto");
if (AreEqual(rand, "none"))
m_imgListRand = false;
else if (!AreEqual(rand, "auto"))
RuntimeError("Only Auto and None are currently supported.");
m_epochStart = 0;
m_mbStart = 0;
}
// Nothing to tear down explicitly; all members clean up in the destructor.
template<class ElemType>
void ImageReader<ElemType>::Destroy()
{
}
// Prepares one epoch: optionally reshuffles the image list, computes the
// epoch window [m_epochStart, m_epochStart + m_epochSize) over 'files', and
// sizes the feature/label staging buffers for one minibatch.
template<class ElemType>
void ImageReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
{
assert(mbSize > 0);
assert(requestedEpochSamples > 0);
if (m_imgListRand)
std::shuffle(files.begin(), files.end(), m_rng);
// requestDataSize means "use the whole list as one epoch".
m_epochSize = (requestedEpochSamples == requestDataSize ? files.size() : requestedEpochSamples);
m_mbSize = mbSize;
// REVIEW alexeyk: if user provides epoch size explicitly then we assume epoch size is a multiple of mbsize, is this ok?
assert(requestedEpochSamples == requestDataSize || (m_epochSize % m_mbSize) == 0);
m_epoch = epoch;
m_epochStart = m_epoch * m_epochSize;
// Wrap around when the requested epoch starts past the end of the list.
// NOTE(review): m_mbStart is only reset on this wrap path, not when the
// epoch starts within range — confirm callers always advance epochs in order.
if (m_epochStart >= files.size())
{
m_epochStart = 0;
m_mbStart = 0;
}
m_featBuf.resize(m_mbSize * m_featDim);
m_labBuf.resize(m_mbSize * m_labDim);
}
// Fills one minibatch: loads each image, runs the transform pipeline, packs
// pixels into m_featBuf and one-hot labels into m_labBuf, then copies both
// into the caller's matrices. Returns false when the epoch is exhausted.
template<class ElemType>
bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
assert(matrices.size() > 0);
assert(matrices.find(m_featName) != matrices.end());
assert(m_mbSize > 0);
// NOTE(review): m_labName is not asserted present; operator[] would insert
// a null pointer and the dereference below would crash — confirm callers
// always pass a label matrix.
Matrix<ElemType>& features = *matrices[m_featName];
Matrix<ElemType>& labels = *matrices[m_labName];
if (m_mbStart >= files.size() || m_mbStart >= m_epochStart + m_epochSize)
return false;
// Clamp the last (possibly partial) minibatch to the end of the file list.
size_t mbLim = m_mbStart + m_mbSize;
if (mbLim > files.size())
mbLim = files.size();
std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));
// Images are decoded and transformed in parallel; each iteration writes to
// disjoint slices of m_featBuf/m_labBuf.
#pragma omp parallel for ordered schedule(dynamic)
for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
{
const auto& p = files[i + m_mbStart];
cv::Mat img{ cv::imread(p.first, cv::IMREAD_COLOR) };
for (auto& t: m_transforms)
t->Apply(img);
assert(img.isContinuous());
auto data = reinterpret_cast<ElemType*>(img.ptr());
std::copy(data, data + m_featDim, m_featBuf.begin() + m_featDim * i);
// One-hot label: p.second is the class id from the map file.
m_labBuf[m_labDim * i + p.second] = 1;
}
size_t mbSize = mbLim - m_mbStart;
features.SetValue(m_featDim, mbSize, m_featBuf.data(), matrixFlagNormal);
labels.SetValue(m_labDim, mbSize, m_labBuf.data(), matrixFlagNormal);
// Frame-mode layout: mbSize parallel sequences of one frame each.
m_pMBLayout->Init(mbSize, 1, false);
m_mbStart = mbLim;
return true;
}
// Legacy IDataReader end-of-data query.
template<class ElemType>
bool ImageReader<ElemType>::DataEnd(EndDataType endDataType)
{
bool ret = false;
switch (endDataType)
{
case endDataNull:
assert(false);
break;
case endDataEpoch:
// NOTE(review): this returns true while m_mbStart is still INSIDE the
// epoch window, which reads as the opposite of "end of epoch". It may be
// intentional legacy DataEnd semantics — confirm against other readers.
ret = m_mbStart < m_epochStart + m_epochSize;
break;
case endDataSet:
// True once the whole image list has been consumed.
ret = m_mbStart >= files.size();
break;
case endDataSentence:
// Frame mode: every minibatch boundary is a "sentence" boundary.
ret = true;
break;
}
return ret;
}
// Reseeds the list-shuffling RNG.
// NOTE(review): the transforms were constructed with the ORIGINAL m_seed in
// the constructor and keep their own copy, so this does not reseed their RNG
// pools — confirm whether that is intended.
template<class ElemType>
void ImageReader<ElemType>::SetRandomSeed(unsigned int seed)
{
m_seed = seed;
m_rng.seed(m_seed);
}
// Explicit instantiations for the two element types exported by the DLL.
template class ImageReader<double>;
template class ImageReader<float>;
}}}

Просмотреть файл

@ -0,0 +1,68 @@
//
// <copyright company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include <random>
#include <memory>
#include "DataReader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// REVIEW alexeyk: can't put it into ImageReader itself as ImageReader is a template.
class ITransform;
// Frame-mode reader that loads images listed in a tab-delimited map file
// (columns: image path, class id), applies a crop/scale/mean-subtract
// transform pipeline, and serves dense feature + one-hot label minibatches.
template<class ElemType>
class ImageReader : public IDataReader<ElemType>
{
public:
ImageReader();
virtual ~ImageReader();
// Non-copyable, non-movable: owns transforms and buffers tied to 'this'.
ImageReader(const ImageReader&) = delete;
ImageReader& operator=(const ImageReader&) = delete;
ImageReader(ImageReader&&) = delete;
ImageReader& operator=(ImageReader&&) = delete;
void Init(const ConfigParameters& config) override;
void Destroy() override;
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override;
bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) override;
bool DataEnd(EndDataType endDataType) override;
size_t GetNumParallelSequences() override { return m_pMBLayout->GetNumParallelSequences(); }
void CopyMBLayoutTo(MBLayoutPtr pMBLayout) override { pMBLayout->CopyFrom(m_pMBLayout); };
void SetRandomSeed(unsigned int seed) override;
private:
unsigned int m_seed;           // seed for m_rng (transforms keep their own copy)
std::mt19937 m_rng;            // shuffles the image list each epoch
std::vector<std::unique_ptr<ITransform>> m_transforms;  // applied in order per image
std::wstring m_featName;       // name of the feature section/matrix
std::wstring m_labName;        // name of the label section/matrix
size_t m_featDim;              // width * height * channels
size_t m_labDim;               // number of classes (one-hot labels)
using StrIntPairT = std::pair<std::string, int>;
std::vector<StrIntPairT> files;  // (image path, class id) from the map file
size_t m_epochSize;            // samples per epoch
size_t m_mbSize;               // samples per minibatch
size_t m_epoch;                // current epoch index
size_t m_epochStart;           // index into 'files' where the epoch begins
size_t m_mbStart;              // index into 'files' where the next minibatch begins
std::vector<ElemType> m_featBuf;  // staging buffer for one minibatch of features
std::vector<ElemType> m_labBuf;   // staging buffer for one minibatch of labels
bool m_imgListRand;            // whether to shuffle 'files' each epoch
MBLayoutPtr m_pMBLayout;       // frame-mode layout handed to the network
};
}}}

Просмотреть файл

@ -0,0 +1,147 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{9BD0A746-0BBD-45B6-B81C-053F03C26CFB}</ProjectGuid>
<SccProjectName>
</SccProjectName>
<SccAuxPath>
</SccAuxPath>
<SccLocalPath>
</SccLocalPath>
<SccProvider>
</SccProvider>
<Keyword>Win32Proj</Keyword>
<RootNamespace>ImageReader</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Choose>
<When Condition="Exists('$(OPENCV_PATH)')">
<PropertyGroup>
<HasOpenCV>true</HasOpenCV>
</PropertyGroup>
</When>
<Otherwise>
<PropertyGroup>
<HasOpenCV>false</HasOpenCV>
</PropertyGroup>
</Otherwise>
</Choose>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IncludePath>..\..\common\include;..\..\math\math;$(OPENCV_PATH)\include;$(IncludePath);</IncludePath>
<LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(OPENCV_PATH)\x64\vc12\lib;$(LibraryPath);</LibraryPath>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
<OpenCVLib Condition="$(HasOpenCV)">opencv_world300.lib</OpenCVLib>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<!-- Compile/link settings shared by all configurations. -->
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>Use</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<!-- NOTE(review): UCIREADER_EXPORTS looks copied from the UCIFastReader project;
     verify whether this reader's export macro should be IMAGEREADER_EXPORTS. -->
<PreprocessorDefinitions>WIN32;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKMath.lib;$(OpenCVLib);%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<PostBuildEvent>
<!-- Copy the OpenCV runtime next to the output binary (only when OpenCV is present). -->
<Command Condition="$(HasOpenCV)">xcopy /Y $(OPENCV_PATH)\x64\vc12\bin\opencv_world300.dll $(TargetDir)</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h" />
<ClInclude Include="..\..\Common\Include\DataReader.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="ImageReader.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\DebugUtil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="Exports.cpp">
<ExcludedFromBuild Condition="!$(HasOpenCV)">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="ImageReader.cpp">
<ExcludedFromBuild Condition="!$(HasOpenCV)">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
<Target Name="CheckDependencies">
<Warning Condition="!$(HasOpenCV)"
Text="ImageReader requires OpenCV library v3.0 or higher to build. Please install the library from http://opencv.org/downloads.html and set OPENCV_PATH environment variable to OpenCV build folder (e.g. C:\src\opencv\build)." />
</Target>
</Project>

Просмотреть файл

@ -0,0 +1,46 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="Exports.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="..\..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="ImageReader.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\DataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="ImageReader.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
<UniqueIdentifier>{0D0EFA10-72A8-4078-840A-B7F76AFEC0A4}</UniqueIdentifier>
</Filter>
<Filter Include="Common\Include">
<UniqueIdentifier>{C6F55578-121A-4D7C-8F57-4172BC5C463B}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,21 @@
//
// <copyright file="dllmain.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// dllmain.cpp : Defines the entry point for the DLL application.
#include "stdafx.h"
// Standard Win32 DLL entry point. This reader performs no per-process or
// per-thread setup/teardown, so every attach/detach notification is simply
// acknowledged with success.
BOOL APIENTRY DllMain(HMODULE /*hModule*/, DWORD ul_reason_for_call, LPVOID /*lpReserved*/)
{
    (void)ul_reason_for_call; // all notification reasons are no-ops here
    return TRUE;
}

Просмотреть файл

@ -0,0 +1,8 @@
// stdafx.cpp : source file that includes just the standard includes
// ImageReader.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -0,0 +1,17 @@
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#include "Platform.h"
// Must appear before any CRT header below is pulled in.
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms
#include "targetver.h"
#ifdef __WINDOWS__
// NOTE(review): system headers are conventionally included as <windows.h>;
// the quoted form also works but searches local directories first.
#include "windows.h"
#endif
#include <stdio.h>
#include <math.h>
// TODO: reference additional headers your program requires here

Просмотреть файл

@ -0,0 +1,9 @@
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
// Guarded so that this header is a no-op on non-Windows builds.
#ifdef __WINDOWS__
#include <SDKDDKVer.h>
#endif

Просмотреть файл

@ -0,0 +1,102 @@
WorkDir=.
ModelDir=$WorkDir$/_out/$ConfigName$
stderr=$WorkDir$/_out/$ConfigName$
ndlMacros=$WorkDir$/Macros.ndl
precision=float
deviceId=Auto
command=Train:AddTop5Eval:Test
Train=[
action=train
modelPath=$ModelDir$/AlexNet
traceLevel=1
NDLNetworkBuilder=[
networkDescription=$WorkDir$/AlexNet.ndl
]
SGD=[
epochSize=0
minibatchSize=128
learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
momentumPerMB=0.9
maxEpochs=90
gradUpdateType=None
L2RegWeight=0.0005
dropoutRate=0*5:0.5
numMBsToShowResult=10
]
reader=[
readerType=ImageReader
# Map file which maps images to labels using the following format:
# <full path to image><tab><numerical label (0-based class id)>
# Example:
# C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
file=$WorkDir$/train_map.txt
# Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
randomize=Auto
features=[
# Below are the required parameters.
width=224
height=224
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType=Random
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.9
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
jitterType=UniRatio
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations=Linear
# Stores mean values for each pixel in OpenCV matrix XML format.
meanFile=$WorkDir$/ImageNet1K_mean.xml
]
labels=[
labelDim=1000
]
]
]
AddTop5Eval=[
action=edit
CurModel=$ModelDir$/AlexNet
NewModel=$ModelDir$/AlexNet.Top5
editPath=$WorkDir$/add_top5_layer.mel
]
Test=[
action=test
modelPath=$ModelDir$/AlexNet.Top5
# Set minibatch size for testing.
minibatchSize=128
NDLNetworkBuilder=[
networkDescription=$WorkDir$/AlexNet.ndl
]
reader=[
readerType=ImageReader
file=$WorkDir$/val_map.txt
randomize=None
features=[
width=224
height=224
channels=3
cropType=Center
meanFile=$WorkDir$/ImageNet1K_mean.xml
]
labels=[
labelDim=1000
]
]
]

Просмотреть файл

@ -0,0 +1,108 @@
# AlexNet-style network description. Macro definitions (ConvReLULayer etc.)
# are loaded from Macros.ndl via ndlMacros in the config.
load=ndlMnistMacros
run=DNN
# Input dimensions, label dimension, and per-layer initialization constants.
ndlMnistMacros = [
ImageW = 224
ImageH = 224
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
labels = Input(LabelDim, tag = label)
conv1WScale = 0.95
conv1BValue = 0
conv2WScale = 2
conv2BValue = 1
conv3WScale = 2.07
conv3BValue = 0
conv4WScale = 2.9
conv4BValue = 1
conv5WScale = 2.4
conv5BValue = 1
fc1WScale = 6.4
fc1BValue = 1
fc2WScale = 3.2
fc2BValue = 1
fc3WScale = 3.2
fc3BValue = 1
]
DNN=[
# conv1
kW1 = 11
kH1 = 11
cMap1 = 64
hStride1 = 3
vStride1 = 3
# weight[cMap1, kW1 * kH1 * ImageC]  (363 = 11*11*3)
conv1_act = ConvReLULayer(features, cMap1, 363, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)
# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
# conv2
kW2 = 5
kH2 = 5
cMap2 = 192
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]  (1600 = 5*5*64)
conv2_act = ConvReLULayer(pool1, cMap2, 1600, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)
# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
# conv3
kW3 = 3
kH3 = 3
cMap3 = 384
hStride3 = 1
vStride3 = 1
# weight[cMap3, kW3 * kH3 * cMap2]  (1728 = 3*3*192)
conv3_act = ConvReLULayer(pool2, cMap3, 1728, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)
# conv4
kW4 = 3
kH4 = 3
cMap4 = 256
hStride4 = 1
vStride4 = 1
# weight[cMap4, kW4 * kH4 * cMap3]  (3456 = 3*3*384)
conv4_act = ConvReLULayer(conv3_act, cMap4, 3456, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue)
# conv5
kW5 = 3
kH5 = 3
cMap5 = 256
hStride5 = 1
vStride5 = 1
# weight[cMap5, kW5 * kH5 * cMap4]  (2304 = 3*3*256)
conv5_act = ConvReLULayer(conv4_act, cMap5, 2304, kW5, kH5, hStride5, vStride5, conv5WScale, conv5BValue)
# pool3
pool3W = 3
pool3H = 3
pool3hStride = 2
pool3vStride = 2
pool3 = MaxPooling(conv5_act, pool3W, pool3H, pool3hStride, pool3vStride)
# Fully-connected layers with dropout.
# NOTE(review): 16384 is the flattened pool3 output count — presumably
# 8*8*cMap5 given the strides above; confirm against the conv/pool geometry.
hiddenDim = 4096
h1 = DNNReLULayer(16384, hiddenDim, pool3, fc1WScale, fc1BValue)
h1_d = Dropout(h1)
h2 = DNNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
h2_d = Dropout(h2)
# NOTE(review): 'labelDim' differs in case from 'LabelDim' defined above —
# relies on case-insensitive name lookup; confirm this resolves as intended.
ol = DNNLastLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]

Просмотреть файл

@ -0,0 +1,25 @@
# Convolution + bias + ReLU.
#   inp       - input node
#   outMap    - number of output feature maps
#   inWCount  - weight columns per output map (kW * kH * input channels,
#               precomputed by the caller)
#   kW, kH    - kernel width/height
#   hStride, vStride - convolution strides
#   wScale    - initValueScale for the Gaussian weight initialization
#   bValue    - constant bias initialization value
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
{
convW = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
convB = Parameter(outMap, 1, init = fixedValue, value = bValue)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
}
# Fully-connected layer: y = ReLU(W * x + b).
#   inDim/outDim - input/output dimensions of the weight matrix
#   x            - input node
#   wScale       - initValueScale for the Gaussian weight initialization
#   bValue       - constant bias initialization value
DNNReLULayer(inDim, outDim, x, wScale, bValue)
{
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(outDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
y = RectifiedLinear(z)
}
# Final fully-connected layer: z = W * x + b, with no nonlinearity —
# the softmax is applied by the criterion node (CrossEntropyWithSoftmax).
DNNLastLayer(hiddenDim, labelDim, x, wScale, bValue)
{
W = Parameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale)
b = Parameter(labelDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
}

Просмотреть файл

@ -0,0 +1,6 @@
# Model-edit script: loads the trained model, appends a top-5 error node
# (ErrorPrediction with a constant K=5 third input), and saves the result
# under a new name for use by the Test command.
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)
SaveModel(m1, $NewModel$, format=cntk)

Просмотреть файл

@ -1505,8 +1505,8 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper
//dump config info
fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
fprintf(stderr, "command line options: \n");
for (int i = 1; i < argc; i++)
fprintf(stderr, "command line: \n");
for (int i = 0; i < argc; i++)
{
fprintf(stderr, "%s ", WCharToString(argv[i]).c_str());
}

Просмотреть файл

@ -40,7 +40,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
ParallelNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{}
{ }
virtual void ComputeInputPartial(const size_t inputIndex)
{
@ -201,13 +201,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class PreComputedNode : public ComputationNodeNonLooping/*ComputationNode*/<ElemType>
{
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembers;
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembers; using Base::OperationName;
public:
//virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
PreComputedNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_hasComputed(false)
{}
{ }
// interface through which this node is operated on are these two functions
@ -234,7 +234,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void LoadFromFile(File& fstream, size_t modelVersion) override
{
Base::LoadFromFile(fstream, modelVersion);
fstream >> m_hasComputed;
CreateMatrixIfNull(m_functionValues);
fstream >> FunctionValues();
@ -257,11 +256,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Validate(isFinalValidationPass);
if (!Inputs(0)->HasMBLayout())
InvalidArgument("%ls %ls operation requires its input to come in minibatches of samples.", NodeName().c_str(), OperationName().c_str());
m_pMBLayout = nullptr; // this node does not hold mini-batch data
if (!m_hasComputed) // this node retains state, and state gets destroyed by Resize(), so we must be careful
Resize(Inputs(0)->GetNumRows(), 1);
else
VerifySize(Inputs(0)->GetNumRows(), 1);
m_pMBLayout = nullptr; // this node does not hold mini-batch data
InferImageDimsFromInputs();
}
@ -373,16 +374,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!IsAccumulating())
LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
Matrix<ElemType> &avg = FunctionValues();
// set gaps to zero, since we are reducing in time
Inputs(0)->MaskMissingValuesColumnsToZero();
auto & samples = Inputs(0)->FunctionValues();
auto & avg = FunctionValues();
#if 1//NANCHECK
samples.HasNan("Mean-Samples");
#endif
size_t numNewSamples = samples.GetNumCols();
Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + samples.GetNumCols()), samples, false,
ConstOnes(numNewSamples, 1, samples.GetDeviceId()),
false, (ElemType)m_numSamples / (m_numSamples + numNewSamples), avg);
size_t numNewSamples = Inputs(0)->GetMBLayout()->DetermineActualNumSamples();
size_t totalNumSamples = m_numSamples + numNewSamples;
if (totalNumSamples == 0) totalNumSamples = 1; // 0/0=1 in this context
Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
ConstOnes(samples.GetNumCols(), 1, samples.GetDeviceId()),
false, (ElemType)m_numSamples / totalNumSamples, avg);
#if 1//NANCHECK
avg.HasNan("Mean-avg");
#endif
@ -453,15 +459,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!IsAccumulating())
LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
// set gaps to zero, since we are reducing in time
Inputs(0)->MaskMissingValuesColumnsToZero();
auto & samples = Inputs(0)->FunctionValues();
#if 1//NANCHECK
samples.HasNan("InvStdDev-Samples");
#endif
m_temp.SetValue(m_mean);
size_t numNewSample = samples.GetNumCols();
Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), samples, false,
ConstOnes(numNewSample, 1, samples.GetDeviceId()),
false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_mean);
size_t numNewSamples = Inputs(0)->GetMBLayout()->DetermineActualNumSamples();
size_t totalNumSamples = m_numSamples + numNewSamples;
if (totalNumSamples == 0) totalNumSamples = 1; // 0/0=1 in this context
Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
ConstOnes(samples.GetNumCols(), 1, samples.GetDeviceId()),
false, (ElemType)m_numSamples / totalNumSamples, m_mean);
m_temp -= m_mean;
m_temp.AssignElementPowerOf(m_temp, 2);
@ -470,9 +481,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_temp.AssignDifferenceOf(samples, m_mean);
m_temp.AssignElementPowerOf(m_temp, 2);
Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), m_temp, false,
ConstOnes(numNewSample, 1, samples.GetDeviceId()),
false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_var);
Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, m_temp, false,
ConstOnes(samples.GetNumCols(), 1, samples.GetDeviceId()),
false, (ElemType)m_numSamples / totalNumSamples, m_var);
#if 1//NANCHECK
m_var.HasNan("InvStdDev-m_var");

Просмотреть файл

@ -330,11 +330,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// helper functions for common cases
private:
ComputationNodeBasePtr Inputs(size_t index) const { return m_children[index]; } // TODO: delete this; change to m_children
// determine number of columns from a child and/or layout
// determine number of columns from a child and/or layout
size_t DetermineNumCols(const ComputationNodeBasePtr & child) const
{
size_t childCols = child->GetNumCols(); // this is what the child says
@ -495,7 +493,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pair<ImageLayout, ImageLayout> GetImageLayouts() const { return make_pair(m_inputImageLayout, m_outputImageLayout); }
const size_t ChildrenSize() const { return m_children.size(); }
const size_t ChildrenSize() const { return m_children.size(); } // TODO: rename to NumChildren() or NumInputs(); and inside here where we use m_children, use m_children.size() as well
virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;
@ -515,7 +513,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void MaskMissingGradientColumnsToZero() = 0;
virtual void MaskMissingGradientColumnsToZero(const size_t timeIdxInSeq) = 0; // TODO: don't we need a FrameRange here, too?
// indicates whether special handling is needed.The standard handleing will be just mask the function values after the evalaution and mask the gradient before gradiant computation for the children. this is not valid for all criterion nodes whose result is a scalar.
// overridden to return true by training/eval criteria (and the soon-to-be-deprecated PairNetworkNode, LSTMNode)
// The need for this seems an artifact of the old inconsistent layout architecture. In the future, this can probably just go away.
@ -844,7 +841,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class C, class... _Types> static inline shared_ptr<C> New(DEVICEID_TYPE deviceId, const wstring & name, _Types&&... _Args)
{
auto p = make_shared<C>(deviceId, name, forward<_Types>(_Args)...); // creates objects, esp. assigns deviceId to matrices, but otherwise does nothing
//disable this line. Instead we should make sure matrices are allocated at the right device
//p->MoveMatricesToDevice(deviceId); // this is a virtual call, i.e. it will handle extra matrices an object might own
return p;
@ -931,7 +927,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ReleaseMatrixToPool(m_functionValues, matrixPool);
}
}
virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const;
// TODO: similar to DumpInfo; used by ExperimentalNetworkBuilder test implementation
@ -1172,9 +1167,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
inline ComputationNodePtr Inputs(const size_t childIndex) const // TODO: rename to Input
{
#ifdef DEBUG // profile shows this is range check very expensive in release mode, skip it
#ifdef _DEBUG // profile shows this is range check very expensive in release mode, skip it
if (childIndex >= m_children.size())
InvalidArgument ("childIndex is out of range.");
LogicError("Inputs: childIndex is out of range.");
#endif
return UpCast(m_children[childIndex]);
}
@ -1207,7 +1202,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Note: This returns an object, not a reference. That object is a column slice, i.e. a small object that just points into another object.
// TODO: remove FrameRange::samplesInRecurrentStep from FrameRange, as it belongs into pMBLayout. Hence this function that binds both together.
// Note: This is not used anywhere yet, only a sketch how we may further abstract timing.
Matrix<ElemType> DataSlice(Matrix<ElemType>& data,
Matrix<ElemType> DataSlice(Matrix<ElemType> & data,
const FrameRange & frameRange/*select frame or entire batch*/)
{
return DataSlice(data, frameRange, m_pMBLayout);
@ -1250,7 +1245,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return data.ColumnSlice(startColumn + frameRange.seqIndex, 1);
}
}
enum ValueOrGradient { VALUE, GRADIENT };
Matrix<ElemType> DataSlice(ValueOrGradient valueOrGradient/*as it says*/,
const FrameRange & frameRange/*select frame or entire batch*/)
@ -1351,7 +1345,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
child->ClearGradient(true);
child->MarkGradientInitialized(true);
}
ComputeInputPartial(i, FrameRange(timeIdxInSeq)); //this computes partial wrt to the child and sums the gradient value in the child
}
#ifdef DISPLAY_DEBUG
@ -1364,16 +1357,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*implement*/void ClearGradientForChildren()
{
for (size_t i = 0; i<m_children.size(); i++)
for (size_t i=0; i<m_children.size(); i++)
{
ComputationNodePtr child = Inputs(i);
child->MarkGradientInitialized(false);
}
}
virtual void ClearGradient(const bool clearExistingGradientValue)
{
{
if (NeedGradient())
{
{
//ClearChildGradientComputationFlag();
if (clearExistingGradientValue)
@ -1436,7 +1429,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
}
}
//to be called by derived classed if that class needs to print node values
void PrintNodeValuesToFile(const bool printValues, File& fstream) const
{
@ -1577,7 +1569,6 @@ public: \
using Base::RequestMatricesBeforeGradientComp; using Base::ReleaseMatricesAfterGradientComp; \
using Base::Validate; using Base::ValidateUnaryMap; using Base::ValidateBinaryZip; using Base::ValidateUnaryReduce; using Base::ValidateBinaryReduce; using Base::ValidateInferBinaryChildrenDims; using Base::ValidateInferChildDims
#define ComputationNodeBoilerplate \
protected: /* some boilerplate goes here */ \
virtual const std::wstring OperationName() const override { return TypeName(); } \

Просмотреть файл

@ -21,7 +21,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template<class ElemType>
class ErrorPredictionNode : public ComputationNodeNonLooping/*ComputationNode*/<ElemType>, public NumInputs<2>
class ErrorPredictionNode : public ComputationNodeNonLooping/*ComputationNode*/<ElemType>
{
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ErrorPrediction"; }
@ -41,16 +41,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
{
EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), *m_maxIndexes0, *m_maxIndexes1, *m_maxValues, shared_from_this());
EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), *m_maxIndexes0, *m_maxIndexes1, *m_maxValues, m_topK, shared_from_this());
}
void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues0, const Matrix<ElemType>& inputFunctionValues1, Matrix<ElemType>& maxIndexes0, Matrix<ElemType>& maxIndexes1, Matrix<ElemType>& maxValues, ComputationNodePtr curNode)
void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues0, const Matrix<ElemType>& inputFunctionValues1, Matrix<ElemType>& maxIndexes0, Matrix<ElemType>& maxIndexes1, Matrix<ElemType>& maxValues, int topK, ComputationNodePtr curNode)
{
inputFunctionValues0.VectorMax(maxIndexes0, maxValues, true);
inputFunctionValues1.VectorMax(maxIndexes1, maxValues, true);
inputFunctionValues1.VectorMax(maxIndexes1, maxValues, true, topK);
curNode->MaskMissingColumnsToZero(maxIndexes0, Inputs(0)->GetMBLayout()); // we are fine since it will only be called with full minibatch
curNode->MaskMissingColumnsToZero(maxIndexes1, Inputs(1)->GetMBLayout());
functionValues.AssignNumOfDiff(maxIndexes0, maxIndexes1);
functionValues.AssignNumOfDiff(maxIndexes0, maxIndexes1, topK > 1);
#if NANCHECK
functionValues.HasNan("ErrorPrediction");
#endif
@ -77,18 +77,38 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ValidateInferChildDims(index, rows, cols);
}
m_topK = 1;
if (m_children.size() == 3)
{
if (Inputs(2)->FunctionValues().GetNumRows() != 1 || Inputs(2)->FunctionValues().GetNumCols() != 1)
throw std::logic_error("TopK in ErrorPredictionNode must be a scalar value.");
m_topK = static_cast<int>(Inputs(2)->FunctionValues().Get00Element());
}
//if (Inputs(0)->GetNumRows() == 0 || Inputs(1)->GetNumRows() == 0)
// LogicError("ErrorPrediction operation: one of the operands has 0 elements.");
if (isFinalValidationPass)
{
if (!(Inputs(0)->GetNumRows() == Inputs(1)->GetNumRows() && Inputs(0)->GetNumCols() == Inputs(1)->GetNumCols()))
{
LogicError("The Matrix dimension in the ErrorPrediction operation does not match.");
}
if (((!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size
Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()))) && Inputs(0)->GetLoopId() < 0)
{
LogicError("The Matrix dimension in the ErrorPrediction operation does not match.");
}
}
Resize(1,1);
m_pMBLayout = nullptr; // this node does not hold mini-batch data
InferImageDimsFromInputs();
// resize the temporaries to their proper size
size_t cols = Inputs(0)->GetNumCols();
m_maxIndexes0->Resize(m_topK, cols);
m_maxIndexes1->Resize(m_topK, cols);
m_maxValues->Resize(m_topK, cols);
}
virtual void InferImageDimsFromInputs()
@ -117,7 +137,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
*node->m_maxValues = *m_maxValues;
}
}
//request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool)
{
@ -136,13 +155,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ReleaseMatrixToPool(m_maxIndexes1, matrixPool);
ReleaseMatrixToPool(m_maxValues, matrixPool);
}
protected:
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
private:
shared_ptr<Matrix<ElemType>> m_maxIndexes0, m_maxIndexes1;
shared_ptr<Matrix<ElemType>> m_maxValues;
int m_topK;
};
template class ErrorPredictionNode<float>;

Просмотреть файл

@ -381,11 +381,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// stacks multiple inputs on top of each other
// -----------------------------------------------------------------------
//this node is used to extract part of the input by rows as the output
// TODO: Really? RowStack indicates something different.
//it has to be continuous segments of rows since each column is treated as one sample
template<class ElemType>
class RowStackNode : public ComputationNode<ElemType> // note: not deriving from NumInputs<> like most other nodes since this one takes a variable number of inputs
class RowStackNode : public ComputationNode<ElemType> // note: not deriving from NumInputs<> like most other nodes, because this one takes a variable number of inputs
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"RowStack"; }
@ -397,21 +394,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
{
Base::CopyTo(nodeP, newName, flags);
auto node = dynamic_pointer_cast<RowStackNode<ElemType>>(nodeP);
if (flags & CopyNodeFlags::copyNodeChildren)
{
node->m_children = m_children;
auto node = dynamic_pointer_cast<RowStackNode<ElemType>>(nodeP);
node->m_startRowIndices = m_startRowIndices;
node->m_inputMatrices = m_inputMatrices;
}
}
virtual void ComputeInputPartial(const size_t inputIndex)
{
if (inputIndex >= ChildrenSize())
InvalidArgument("RowStack-ComputeInputPartial: inputIndex out of range.");
ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndices[inputIndex], m_startRowIndices[inputIndex + 1] - m_startRowIndices[inputIndex]);
ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndices[inputIndex]);
}
virtual void /*ComputationNode::*/ComputeInputPartial(const size_t inputIndex, const FrameRange & frameRange) override
@ -419,62 +412,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check_t(GetNumParallelSequences(), m_pMBLayout));
Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check_t(GetNumParallelSequences(), m_pMBLayout));
ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndices[inputIndex], m_startRowIndices[inputIndex+1] - m_startRowIndices[inputIndex]);
ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndices[inputIndex]);
}
/*TODO: merge with call site*/void ComputeInputPartialS(Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const size_t startIndex, const size_t numRows)
/*TODO: merge with call site*/void ComputeInputPartialS(Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const size_t startIndex)
{
inputGradientValues.AddWithRowSliceValuesOf(gradientValues, startIndex, numRows);
}
void EvaluateThisNodeMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
EvaluateThisNodeS(FunctionValues(), m_inputMatrices, 0, Inputs(0)->GetNumCols());
inputGradientValues.AddWithRowSliceValuesOf(gradientValues, startIndex, inputGradientValues.GetNumRows());
}
virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) override
{
//if (frameRange.IsAllFrames()) { EvaluateThisNodeMap(); return; }
Matrix<ElemType> sliceFunctionValues = ValueSlice(frameRange/*TODO: delete this:*/.Check_t(GetNumParallelSequences(), m_pMBLayout));
EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
}
// TODO: change to FrameRange
/*TODO: merge with call site*/void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
functionValues.AssignRowStackValuesOf(inputMatrices, sliceStartCol, sliceNumCols);
#if NANCHECK
functionValues.HasNan("RowStack");
#endif
for (size_t i = 0; i < ChildrenSize(); i++)
ValueSlice(frameRange).AssignToRowSliceValuesOf(Inputs(i)->ValueSlice(frameRange), m_startRowIndices[i], Inputs(i)->GetNumRows());
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
size_t numCols = Inputs(0)->GetNumCols();
m_startRowIndices.resize(ChildrenSize()+1);
m_inputMatrices.resize(ChildrenSize());
// count totalRows and form m_startRowIndices[] array, which is the cumulative sum of matrix heights
m_startRowIndices.resize(ChildrenSize());
size_t totalRows = 0;
m_startRowIndices[0] = 0;
for (int i = 0; i < ChildrenSize(); i++)
{
Matrix<ElemType>& childMatrix = Inputs(i)->FunctionValues();
size_t numRows = childMatrix.GetNumRows();
if (isFinalValidationPass && childMatrix.GetNumCols() != numCols)
if (isFinalValidationPass && Inputs(i)->GetNumCols() != numCols)
LogicError("RowStack operation: the input node %ls has different number of columns.", Inputs(i)->NodeName().c_str());
totalRows += numRows;
m_inputMatrices[i] = &childMatrix;
m_startRowIndices[i + 1] = m_startRowIndices[i] + numRows;
m_startRowIndices[i] = totalRows;
totalRows += Inputs(i)->GetNumRows();
}
Resize(totalRows, numCols);
InferMBLayoutFromInputsForStandardCase();
InferImageDimsFromInputs();
}
@ -489,8 +461,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
private:
std::vector<size_t> m_startRowIndices; //start row number in the stacked matrix of each input (child)
std::vector<const Matrix<ElemType>*> m_inputMatrices;
std::vector<size_t> m_startRowIndices; // start row number in the stacked matrix of each input (child) (cumsum of matrix heights)
};
template class RowStackNode<float>;
@ -1823,7 +1794,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
*node->m_rightGradient = *m_rightGradient;
}
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool)
{
@ -1839,7 +1809,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ReleaseMatrixToPool(m_innerproduct, matrixPool);
ReleaseMatrixToPool(m_rightGradient, matrixPool);
}
private:
shared_ptr<Matrix<ElemType>> m_innerproduct;
shared_ptr<Matrix<ElemType>> m_rightGradient;
@ -2034,7 +2003,6 @@ private:
*node->m_temp = *m_temp;
}
}
//request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool)
{
@ -2062,7 +2030,6 @@ private:
ReleaseMatrixToPool(m_rightTerm, matrixPool);
ReleaseMatrixToPool(m_temp, matrixPool);
}
private:
// invNorm nodes tranfer data between EvaluateThisNode and ComputeInputPartial
shared_ptr<Matrix<ElemType>> m_invNorm0;
@ -2437,7 +2404,6 @@ private:
*node->m_temp = *m_temp;
}
}
//request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool)
{
@ -2467,7 +2433,6 @@ private:
ReleaseMatrixToPool(m_invNormSquare, matrixPool);
ReleaseMatrixToPool(m_temp, matrixPool);
}
private:
// invNorm nodes tranfer data between EvaluateThisNode and ComputeInputPartial
shared_ptr<Matrix<ElemType>> m_invNorm0;

Просмотреть файл

@ -21,8 +21,12 @@
# defaults to acml
# CUDA_PATH= Path to CUDA
# If not specified, GPU will not be enabled
# CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
# defaults to /usr/local/cub-1.4.1
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.0.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.0.0
ifndef BUILD_TOP
BUILD_TOP=.
@ -77,7 +81,12 @@ ifdef CUDA_PATH
ifndef GDK_PATH
$(info defaulting GDK_PATH to /usr)
GDK_PATH=/usr
endif
endif
ifndef CUB_PATH
$(info defaulting CUB_PATH to /usr/local/cub-1.4.1)
CUB_PATH=/usr/local/cub-1.4.1
endif
DEVICE = gpu
@ -85,6 +94,7 @@ endif
# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
INCLUDEPATH+=$(CUB_PATH)
NVMLPATH=$(GDK_PATH)/src/gdk/nvml/lib
# Set up CUDA includes and libraries
@ -367,6 +377,26 @@ $(KALDI2READER): $(KALDI2READER_OBJ) | $(CNTKMATH_LIB)
endif
########################################
# ImageReader plugin
########################################
ifdef OPENCV_PATH
IMAGEREADER_SRC =\
DataReader/ImageReader/Exports.cpp \
DataReader/ImageReader/ImageReader.cpp \
IMAGEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(IMAGEREADER_SRC))
IMAGEREADER:=$(LIBDIR)/ImageReader.so
ALL += $(IMAGEREADER)
SRC+=$(IMAGEREADER_SRC)
$(IMAGEREADER): $(IMAGEREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
endif
########################################
# cntk
########################################

Просмотреть файл

@ -597,6 +597,7 @@ namespace CNTKMathTest
M4(1, 0) = 8; M4(1, 1) = 18; M4(1, 2) = 28;
Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
#if 0
Matrix M5, M6, M7, M8;
M5.AssignRowSliceValuesOf(M0, 0, 2);
M6.AssignRowSliceValuesOf(M0, 2, 1);
@ -610,6 +611,7 @@ namespace CNTKMathTest
M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
#endif
}
TEST_METHOD(CPUAssignRepeatOf)

Просмотреть файл

@ -316,6 +316,7 @@ namespace CNTKMathTest
GPUMatrix<float> M4(2, 3, fArray4, matrixFlagNormal);
Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
#if 0
GPUMatrix<float> M5, M6, M7, M8;
M5.AssignRowSliceValuesOf(M0, 0, 2);
M6.AssignRowSliceValuesOf(M0, 2, 1);
@ -329,6 +330,7 @@ namespace CNTKMathTest
M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
#endif
}
TEST_METHOD(GPUKhatriRaoProduct)

Просмотреть файл

@ -821,5 +821,60 @@ namespace CNTKMathTest
Assert::IsTrue(SingleMatrix::HasElement(m1, posInf));
}
}
TEST_METHOD(MatrixVectorMax)
{
// Matrices are stored as column-major so below is 3x2 matrix.
float src[] = {
1.0f, 3.0f, 4.0f,
6.0f, 2.0f, 5.0f };
float expectedIdx[] = {
2.0f, 1.0f,
0.0f, 2.0f };
float expectedVal[] = {
4.0f, 3.0f,
6.0f, 5.0f };
for (int deviceId : { -1, AUTOPLACEMATRIX })
{
Matrix<float> expIdx(2, 2, expectedIdx, matrixFlagNormal, deviceId);
Matrix<float> expVal(2, 2, expectedVal, matrixFlagNormal, deviceId);
Matrix<float> actual(3, 2, src, matrixFlagNormal, deviceId);
Matrix<float> actualIdx(deviceId);
Matrix<float> actualVal(deviceId);
int topK = 2;
actual.VectorMax(actualIdx, actualVal, true, topK);
Assert::IsTrue(actualIdx.IsEqualTo(expIdx));
Assert::IsTrue(actualVal.IsEqualTo(expVal));
}
}
TEST_METHOD(MatrixAssignNumOfDiff)
{
float labels[] = { 1.0f, 2.0f, 3.0f };
// Matrices are stored as column-major so below is 2x3 matrix.
float topKResults[] = {
1.0f, 3.0f,
4.0f, 6.0f,
2.0f, 3.0f };
for (int deviceId : { -1, AUTOPLACEMATRIX })
{
Matrix<float> lbl(1, 3, labels, matrixFlagNormal, deviceId);
Matrix<float> topKRes(2, 3, topKResults, matrixFlagNormal, deviceId);
Matrix<float> actual(deviceId);
actual.AssignNumOfDiff(lbl, topKRes, true);
float expectedDiff = 1.0;
Assert::AreEqual(expectedDiff, actual.Get00Element());
}
}
};
}

Просмотреть файл

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -24,7 +24,7 @@
</SccProvider>
<ProjectName>CNTKMathCUDA</ProjectName>
<CudaPath>$(CUDA_PATH_V7_0)</CudaPath>
<CudaToolkitCustomDir>$(CUDA_PATH)</CudaToolkitCustomDir>
<CudaToolkitCustomDir>$(CudaPath)</CudaToolkitCustomDir>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
@ -47,7 +47,7 @@
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IncludePath>..\..\common\include;$(ACML_PATH)\include;$(CudaPath)\include;$(IncludePath)</IncludePath>
<IncludePath>..\..\common\include;$(ACML_PATH)\include;$(CudaPath)\include;$(CUB_PATH);$(IncludePath)</IncludePath>
<LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(ACML_PATH)\lib;$(CudaPath)\lib\$(Platform);$(LibraryPath)</LibraryPath>
<IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
</PropertyGroup>
@ -187,4 +187,8 @@
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.0.targets" />
</ImportGroup>
<Target Name="CheckDependencies">
<Error Condition="!Exists('$(CUB_PATH)')"
Text="CNTK requires NVIDIA CUB library v1.4.1 to build. Please download the library from https://nvlabs.github.io/cub/ and set CUB_PATH environment variable to CUB root path (e.g. c:\src\cub-1.4.1)." />
</Target>
</Project>

Просмотреть файл

@ -31,9 +31,7 @@
<ClCompile Include="cudalib.cpp">
<Filter>GPU\SequenceTraining</Filter>
</ClCompile>
<ClCompile Include="DebugUtil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\DebugUtil.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />

Просмотреть файл

@ -20,6 +20,7 @@
#include <exception>
#include <thread>
#include<iostream>
#include <algorithm>
#ifdef _WIN32
#include <Windows.h>
#else
@ -477,6 +478,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return diag;
}
#if 0
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
@ -518,6 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
#endif
template<class ElemType>
void CPUMatrix<ElemType>::MinusOneAt(CPUMatrix<ElemType>& c, const size_t position)
@ -3299,7 +3302,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//I decided to use CPUMatrix<ElemType>& maxIndexes instead of integer vector because the result may be used to do additional calculation
template<class ElemType>
void CPUMatrix<ElemType>::VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise) const
void CPUMatrix<ElemType>::VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise, int topK) const
{
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
@ -3307,33 +3310,63 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
const int m = (int)GetNumRows();
const int n = (int)GetNumCols();
assert(topK <= m);
assert (m>0 && n>0); //converting from size_t to int may cause overflow
if (isColWise) //col-wise
{
maxValues.Resize(1, n);
maxIndexes.Resize(1, n);
maxValues.Resize(topK, n);
maxIndexes.Resize(topK, n);
#pragma omp parallel for
for (int j=0; j<n; j++)
if (topK == 1)
{
ElemType v = us(0, j);
size_t index = 0;
foreach_row(i,us)
#pragma omp parallel for
for (int j = 0; j < n; j++)
{
if (v < us(i,j))
ElemType v = us(0, j);
size_t index = 0;
foreach_row(i, us)
{
index = i;
v = us(i,j);
if (v < us(i, j))
{
index = i;
v = us(i, j);
}
}
maxValues(0, j) = v;
maxIndexes(0, j) = (ElemType)index;
}
}
else
{
std::vector<int> indices(m);
int i = 0;
std::generate(indices.begin(), indices.end(), [&i] { return i++; });
const ElemType* curVal = m_pArray;
ElemType* curIdx = maxIndexes.m_pArray;
ElemType* curMax = maxValues.m_pArray;
for (int icol = 0; icol < n; icol++, curVal += m, curIdx += topK, curMax += topK)
{
// Partial sort, descending order.
std::nth_element(indices.begin(), indices.begin() + topK, indices.end(),
[curVal](const int& a, const int& b) { return curVal[a] > curVal[b]; });
// REVIEW alexeyk: the following produces warning (see SCL_SECURE_NO_WARNINGS) so use loop instead.
//std::transform(indices.begin(), indices.begin() + topK, curIdx, [](const int& a) { return static_cast<ElemType>(a); });
for (int i = 0; i < topK; i++)
{
curIdx[i] = static_cast<ElemType>(indices[i]);
curMax[i] = curVal[indices[i]];
}
}
maxValues(0,j) = v;
maxIndexes(0,j) = (ElemType)index;
}
}
else
{
if (topK > 1)
RuntimeError("Row-wise TopK max is not supported.");
maxValues.Resize(m,1);
maxIndexes.Resize(m, 1);
@ -3418,19 +3451,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, bool searchInCol)
{
if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
InvalidArgument("AssignNumOfDiff: a and b must have same dimension.");
if (a.GetNumCols() != b.GetNumCols())
throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of columns.");
if (!searchInCol && a.GetNumRows() != b.GetNumRows())
throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of rows.");
ElemType n = 0;
foreach_coord(i,j,a)
if (!searchInCol)
{
n += (a(i,j) != b(i,j));
foreach_coord(i, j, a)
{
n += (a(i, j) != b(i, j));
}
}
else
{
size_t crow = b.GetNumRows();
const ElemType* curCol = b.m_pArray;
for (size_t icol = 0; icol < a.GetNumCols(); icol++, curCol += crow)
{
auto res = std::find(curCol, curCol + crow, a(0, icol));
if (res == curCol + crow)
n++;
}
}
Resize(1,1); //result should be one element
(*this)(0,0) = n;
Resize(1, 1); //result should be one element
(*this)(0, 0) = n;
return *this;
}

Просмотреть файл

@ -242,7 +242,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& AssignRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AddToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AddWithRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
//CPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
CPUMatrix<ElemType>& AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
@ -252,10 +252,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& AssignPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
CPUMatrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
void VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise) const;
void VectorMin(CPUMatrix<ElemType>& mainndexes, CPUMatrix<ElemType>& minValues, const bool isColWise) const;
void VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise, int topK = 1) const;
void VectorMin(CPUMatrix<ElemType>& minIndexes, CPUMatrix<ElemType>& minValues, const bool isColWise) const;
CPUMatrix<ElemType>& AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);
CPUMatrix<ElemType>& AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, bool searchInCol = false);
void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
void Print(const char* matrixName = nullptr) const; //print whole matrix. can be expensive

Просмотреть файл

@ -485,6 +485,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>::~GPUMatrix(void)
{
Clear();
if (m_workspace != nullptr)
delete m_workspace;
}
template<class ElemType>
@ -711,6 +713,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return diag;
}
#if 0
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
@ -767,6 +770,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
#endif
/// c = c - 1.0 for a specific position
template<class ElemType>
@ -1159,12 +1163,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void GPUMatrix<ElemType>::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed)
{
PrepareDevice();
#if 0 // to change the seed, we must tear down the random generator
// This is not efficient, but for correctness, we must do it.
if (s_curandGenerator && (seed != USE_TIME_BASED_SEED))
{
fprintf(stderr, "SetUniformRandomValue (GPU): destroying curand object\n");
CURAND_CALL(curandDestroyGenerator(((curandGenerator_t*)s_curandGenerator)[0])); // TODO: what is this typecast business??
delete s_curandGenerator;
s_curandGenerator = NULL;
}
#endif
if (s_curandGenerator==NULL)
{
{
unsigned long long cudaSeed = (seed == USE_TIME_BASED_SEED) ? time(NULL) : seed;
fprintf(stderr, "SetUniformRandomValue (GPU): creating curand object with seed %llu\n", cudaSeed);
s_curandGenerator = new curandGenerator_t;
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? time(NULL) : seed));
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], cudaSeed));
CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED));
}
@ -1197,12 +1213,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void GPUMatrix<ElemType>::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
{
PrepareDevice();
#if 0 // to change the seed, we must tear down the random generator
// This is not efficient, but for correctness, we must do it.
if (s_curandGenerator && (seed != USE_TIME_BASED_SEED))
{
fprintf(stderr, "SetGaussianRandomValue (GPU): destroying curand object\n");
CURAND_CALL(curandDestroyGenerator(((curandGenerator_t*)s_curandGenerator)[0])); // TODO: what is this typecast business??
delete s_curandGenerator;
s_curandGenerator = NULL;
}
#endif
if (s_curandGenerator==NULL)
{
{
unsigned long long cudaSeed = (seed == USE_TIME_BASED_SEED) ? time(NULL) : seed;
fprintf(stderr, "SetGaussianRandomValue (GPU): creating curand object with seed %llu\n", cudaSeed);
s_curandGenerator = new curandGenerator_t;
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? time(NULL) : seed));
CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], cudaSeed));
CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED));
}
@ -2906,32 +2934,132 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
const GPUMatrix<ElemType>& us=*this;
const GPUMatrix<ElemType>& us = *this;
const CUDA_LONG m = (CUDA_LONG)GetNumRows();
const CUDA_LONG n = (CUDA_LONG)GetNumCols();
assert (m>0 && n>0); //converting from size_t to int may cause overflow
assert(m > 0 && n > 0); //converting from size_t to int may cause overflow
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (isColWise)
{
maxValues.Resize(1, n);
maxIndexes.Resize(1, n);
int blocksPerGrid = n; //we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, true><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n);
_vectorMaxMinReduce<ElemType, true><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(us.m_pArray, maxIndexes.m_pArray, maxValues.m_pArray, m, n);
/*int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
/*int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
_vectorMax<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise);*/
}
else
{
maxValues.Resize(m, 1);
maxIndexes.Resize(m, 1);
int blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock);
_vectorMax<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise);
int blocksPerGrid = (int)ceil(1.0*m / threadsPerBlock);
_vectorMax<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(us.m_pArray, maxIndexes.m_pArray, maxValues.m_pArray, m, n, isColWise);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
__global__ void _initIndicesForSort(uint64_t* indexes, CUDA_LONG crow, CUDA_LONG ccol)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= crow * ccol)
return;
uint32_t irow = id % crow;
uint32_t icol = id / crow;
indexes[id] = (static_cast<uint64_t>(irow) << 32) | icol;
}
template<class ElemType>
void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK) const
{
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
if (topK == 1)
{
VectorMax(maxIndexes, maxValues, isColWise);
return;
}
if (!isColWise)
RuntimeError("Row-wise TopK max is not supported.");
const GPUMatrix<ElemType>& us = *this;
const CUDA_LONG m = (CUDA_LONG)GetNumRows();
const CUDA_LONG n = (CUDA_LONG)GetNumCols();
assert(topK <= m);
assert(m > 0 && n > 0); //converting from size_t to int may cause overflow
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
maxValues.Resize(topK, n);
maxIndexes.Resize(topK, n);
// To sort matrix columns we use 2-pass _stable_ sort algorithm:
// 1. Sort by values (descending) with corresponding row/col indexes.
// 2. Sort by col indices (ascending) with corresponding values/row indices.
// Indices are stored as 64-bit ints where low 32 bits represent column and high 32 bits - row index.
// On the second pass only first 32 bits of the index are used in sorting, so SortPairs has
// begin_bit and end_bit set accordingly.
CUDA_LONG celt = static_cast<CUDA_LONG>(GetNumElements());
ElemType* inVal = us.m_pArray;
ElemType* outVal1 = nullptr;
ElemType* outVal2 = nullptr;
uint64_t* inIdx = nullptr;
uint64_t* outIdx = nullptr;
// Determine temp buffer size needed for SortPairsDescending to sort values on the first pass.
size_t cbtemp = 0;
// If first param is nullptr then no actual work is done except writing result to cbtemp.
CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(nullptr, cbtemp, inVal, outVal1, inIdx, outIdx, celt, 0, sizeof(ElemType) * 8, t_stream));
size_t ctemp1 = (cbtemp + sizeof(ElemType) - 1) / sizeof(ElemType);
// Determine temp buffer size needed for SortPairs to sort indices on the second pass.
cbtemp = 0;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, cbtemp, outIdx, inIdx, outVal1, outVal2, celt, 0, 32, t_stream));
size_t ctemp2 = (cbtemp + sizeof(ElemType) - 1) / sizeof(ElemType);
size_t ctemp = std::max(ctemp1, ctemp2);
cbtemp = ctemp * sizeof(ElemType);
// ElemType count needed to store indices, accounting for natural alignment for uint64_t type.
size_t cidx = ((celt + 1) * sizeof(uint64_t) - 1 + sizeof(ElemType) - 1) / sizeof(ElemType);
// Prepare temp workspace.
auto deviceId = m_computeDevice;
assert(m_workspace != nullptr);
auto workspace = m_workspace->pop_or_create([deviceId]() { return std::make_unique<GPUMatrix<ElemType>>(deviceId); });
// Resize to store: output values for the 1st and 2nd passes, input indices, output indices, and temp storage.
workspace->Resize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m);
outVal1 = workspace->m_pArray;
outVal2 = outVal1 + celt;
inIdx = reinterpret_cast<uint64_t*>(outVal2 + celt);
// Align indices pointer if needed.
size_t cbAlign = reinterpret_cast<size_t>(inIdx) % sizeof(uint64_t);
if (cbAlign != 0)
reinterpret_cast<uint8_t*&>(inIdx) += sizeof(uint64_t) - cbAlign;
outIdx = inIdx + celt;
void* ptmp = outIdx + celt;
assert(reinterpret_cast<ElemType*>(reinterpret_cast<uint8_t*>(ptmp) + cbtemp) <= workspace->m_pArray + workspace->GetNumElements());
// Initialize indices.
const int ThreadsPerBlock = 128;
int cblock = (celt + ThreadsPerBlock - 1) / ThreadsPerBlock;
_initIndicesForSort<<<cblock, ThreadsPerBlock, 0, t_stream>>>(inIdx, m, n);
// Sort by values.
CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(ptmp, cbtemp, inVal, outVal1, inIdx, outIdx, celt, 0, sizeof(ElemType) * 8, t_stream));
// Sort by column indices. outIdx contains indices after the first pass so it's used as an input.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(ptmp, cbtemp, outIdx, inIdx, outVal1, outVal2, celt, 0, 32, t_stream));
// Copy results.
cblock = (topK * n + ThreadsPerBlock - 1) / ThreadsPerBlock;
_copyTopKResults<<<cblock, ThreadsPerBlock, 0, t_stream>>>(inIdx, outVal2, maxIndexes.m_pArray, maxValues.m_pArray, m, n, topK);
m_workspace->push(std::move(workspace));
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
@ -2975,21 +3103,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, bool searchInCol)
{
if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
InvalidArgument("AssignNumOfDiff: a and b must have same dimension.");
if (a.GetNumCols() != b.GetNumCols())
InvalidArgument("AssignNumOfDiff: a and b must have the same number of columns.");
if (!searchInCol && a.GetNumRows() != b.GetNumRows())
InvalidArgument("AssignNumOfDiff: a and b must have the same number of rows.");
Resize(1,1); //result should be one element
Resize(1, 1); //result should be one element
PrepareDevice();
cudaEvent_t done = nullptr;
//int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
//_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, a.GetNumElements());
_assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, (CUDA_LONG)a.GetNumElements());
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (!searchInCol)
{
//int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);
//_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, a.GetNumElements());
_assignNumOfDiff<ElemType><<<1, 1024, 0, t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, (CUDA_LONG)a.GetNumElements());
}
else
{
const int blockSize = 1024;
_assignNumOfDiffCol<blockSize><<<1, blockSize, 0, t_stream>>>(a.m_pArray, b.m_pArray, m_pArray,
static_cast<CUDA_LONG>(b.GetNumRows()), static_cast<CUDA_LONG>(a.GetNumCols()));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
return *this;
}

Просмотреть файл

@ -88,6 +88,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static cublasHandle_t s_cuHandle[MaxGpus];
static void *s_curandGenerator;
// Have to use naked pointer to avoid issues with __declspec(dllexport) on Windows.
// REVIEW alexeyk: can be allocated lazily but the current footprint is small anyway.
mutable conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>* m_workspace = new conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>;
private:
void performInplaceFunction(int kind);
size_t LocateElement (const size_t i, const size_t j) const;
@ -286,7 +290,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>& AssignRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AddToRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
//GPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
GPUMatrix<ElemType>& AssignRepeatOf(const GPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
GPUMatrix<ElemType>& AddToRowRepeatValuesOf(const GPUMatrix<ElemType>& a, const size_t numRowRepeats);
@ -295,9 +299,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
void VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise) const;
void VectorMin(GPUMatrix<ElemType>& mainndexes, GPUMatrix<ElemType>& minValues, const bool isColWise) const;
void VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK) const;
void VectorMin(GPUMatrix<ElemType>& minIndexes, GPUMatrix<ElemType>& minValues, const bool isColWise) const;
GPUMatrix<ElemType>& AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
GPUMatrix<ElemType>& AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, bool searchInCol = false);
GPUMatrix<ElemType>& AssignInnerProductOfMatrices(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);

Просмотреть файл

@ -12,6 +12,20 @@
#include <cuda_runtime.h>
#include "CommonMatrix.h"
#include "device_functions.h"
#include <assert.h>
// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER
#pragma warning (push)
#pragma warning (disable: 4100)
#pragma warning (disable: 4127)
#pragma warning (disable: 4201)
#pragma warning (disable: 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning (pop)
#endif
// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
// For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
@ -4502,4 +4516,47 @@ __global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType *
//error[id] -= alpha * (label[id] - dnnoutput[id] );
}
// Scatters top-K sort results into the output matrices of VectorMax(topK).
// 'indexes' holds packed 64-bit keys whose high 32 bits are the original row
// index of each element (low 32 bits are ignored here); 'values' holds the
// corresponding element values. One thread fills one output cell.
// NOTE(review): assumes 'indexes'/'values' are column-major with 'crow' rows
// per column and that the first 'topK' entries of each column are the winners
// — confirm against the producing sort kernel.
template<class ElemType>
__global__ void _copyTopKResults(const uint64_t* indexes, const ElemType* values, ElemType* maxIndexes, ElemType* maxValues,
    CUDA_LONG crow, CUDA_LONG ccol, int topK)
{
    // Global linear thread id; launch must cover topK * ccol threads.
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= topK * ccol)
        return;
    // Decompose linear id into (rank within top-K, column).
    CUDA_LONG irow = id % topK;
    CUDA_LONG icol = id / topK;
    // High 32 bits of the packed key carry the original row index.
    maxIndexes[id] = static_cast<CUDA_LONG>(indexes[icol * crow + irow] >> 32);
    maxValues[id] = values[icol * crow + irow];
}
// Counts, across all columns, how many entries of row vector 'a' do NOT occur
// anywhere in the corresponding column of 'b', and writes that count to *c.
// Used as the error-count step of AssignNumOfDiff with searchInCol=true
// (a[icol] is the label; column icol of 'b' holds crowB candidate values).
// Must be launched as a single block (asserted below) whose size equals the
// BlockSize template parameter, since cub::BlockReduce is specialized on it.
template<int BlockSize, class ElemType>
__global__ void _assignNumOfDiffCol(const ElemType *a, const ElemType *b, ElemType *c, CUDA_LONG crowB, CUDA_LONG ccol)
{
    // Single-block kernel: the final block-wide reduction is the whole answer.
    assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);

    int cur = 0; // per-thread count of "not found" columns
    CUDA_LONG icol = threadIdx.x;
    // Threads stride over columns blockDim.x apart.
    for (; icol < ccol; icol += blockDim.x)
    {
        ElemType key = a[icol];
        CUDA_LONG idxB = icol * crowB; // start of column icol in column-major 'b'
        CUDA_LONG irow = 0;
        // Linear scan of the column; stop at the first match.
        for (; irow < crowB; irow++, idxB++)
        {
            if (b[idxB] == key)
                break;
        }
        // irow == crowB means the scan finished without a match -> count a diff.
        cur += (irow == crowB);
    }
    // Sum the per-thread counts across the block; thread 0 writes the result.
    using BlockReduceT = cub::BlockReduce<int, BlockSize>;
    __shared__ typename BlockReduceT::TempStorage tmp;
    int res = BlockReduceT(tmp).Sum(cur);
    if (threadIdx.x == 0)
        *c = res;
}
#endif // !CPUONLY

Просмотреть файл

@ -1737,6 +1737,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
#if 0 // no longer needed, and overkill anyway as it can be implemented as a bunch of calls to AssignRowSliceValuesOf()
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
@ -1797,7 +1798,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
#endif
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats)
@ -2456,7 +2457,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNumOfDiff (const Matrix<ElemType>& a, const Matrix<ElemType>& b)
Matrix<ElemType>& Matrix<ElemType>::AssignNumOfDiff (const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol)
{
DecideAndMoveToRightDevice(a, b, *this);
//WARNING: a and b must have same type
@ -2467,8 +2468,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
DISPATCH_MATRIX_ON_FLAG(this,
this,
this->m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix),
this->m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix),
this->m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol),
this->m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix, searchInCol),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
@ -3390,13 +3391,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
DISPATCH_MATRIX_ON_FLAG(this,
&maxValues,
this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix,*maxValues.m_CPUMatrix,isColWise); maxIndexes.SetDataLocation(CPU, DENSE),
this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix,*maxValues.m_GPUMatrix,isColWise); maxIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndexes.SetDataLocation(CPU, DENSE),
this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); maxIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
}
}
// Top-K variant of VectorMax: for each vector (column if isColWise, else row)
// finds the topK largest values and their indexes, dispatching to the CPU or
// GPU dense implementation. maxIndexes/maxValues are moved to this matrix's
// device and converted to its matrix type/format before dispatch.
// Raises LogicError on an empty matrix; sparse formats are NOT_IMPLEMENTED.
template<class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
{
    if (IsEmpty())
        LogicError("VectorMax: Matrix is empty.");

    // Ensure all three matrices live on the same (preferred) device.
    DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
    // Outputs adopt this matrix's type/format; 'false' = no need to keep old values.
    maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
    maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);

    // Arms are (CPU dense, GPU dense, CPU sparse, GPU sparse); each arm also
    // marks maxIndexes' data location since the macro only tracks &maxValues.
    DISPATCH_MATRIX_ON_FLAG(this,
        &maxValues,
        this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(CPU, DENSE),
        this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(GPU, DENSE),
        NOT_IMPLEMENTED,
        NOT_IMPLEMENTED
        );
}
template<class ElemType>
void Matrix<ElemType>::VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const

Просмотреть файл

@ -75,7 +75,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
mutable size_t m_numTimesDeviceChanged;
mutable size_t m_numTimesMatrixTypeChanged;
mutable int m_devicesTransferedTo[2];
mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics
//Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
void _transferFromDeviceToDevice(int id_from, int id_to, bool ismoved=true,bool emptyTransfer=false) const;
@ -332,7 +332,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
//Matrix<ElemType>& AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
Matrix<ElemType>& AddToRowRepeatValuesOf(const Matrix<ElemType>& a, const size_t numRepeats);
@ -368,9 +368,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& AssignSignOf(const Matrix<ElemType>& a);
Matrix<ElemType>& AddSignOf(const Matrix<ElemType>& a);
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const;
void VectorMin(Matrix<ElemType>& mainndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const;
void VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);
Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first

Просмотреть файл

@ -505,7 +505,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//for each column of a, we add all rows of a to this starting from startIndex
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddToRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) { return *this; }
//template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRepeatOf(const GPUMatrix<ElemType>& /*a*/, const size_t numRowRepeats, const size_t numColRepeats) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddToRowRepeatValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t numRowRepeats) { return *this; }
@ -863,10 +863,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise) const
{}
template<class ElemType> void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK, GPUMatrix<ElemType>& workspace) const
{}
template<class ElemType> void GPUMatrix<ElemType>::VectorMin(GPUMatrix<ElemType>& minIndexes, GPUMatrix<ElemType>& minValues, const bool isColWise) const
{}
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemType>& /*a*/, const GPUMatrix<ElemType>& /*b*/) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemType>& /*a*/, const GPUMatrix<ElemType>& /*b*/, bool /*searchInCol = false*/) { return *this; }
#pragma endregion Member BLAS Functions

Просмотреть файл

@ -1,41 +1,18 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
. $TEST_ROOT_DIR/run-test-common
ConfigDir=$TEST_DIR/../..
LogFileName=stderr
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
if cntkmpirun "-n 4" SimpleMultiGPU.config "precision=double SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=64]]]]" ; then
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank3
exit $ExitCode
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
exit $?
fi
configFile=$TEST_DIR/../../SimpleMultiGPU.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
# Since we use the MS MPI program on Windows, the CNTK binary path argument
# passed to mpiexec must be in the windows format
CNTKBinaryPath=$TEST_CNTK_BINARY
if [ "$OS" == "Windows_NT" ]; then
CNTKBinaryPath=$(cygpath -aw $CNTKBinaryPath)
fi
MPI_ARGS="-n 4"
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID stderr=$RunDir/stderr precision=double SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=64]]]]"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running "$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
"$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank3
exit $ExitCode

Просмотреть файл

@ -1,41 +1,18 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
. $TEST_ROOT_DIR/run-test-common
ConfigDir=$TEST_DIR/../..
LogFileName=stderr
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
if cntkmpirun "-n 4" SimpleMultiGPU.config "precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]" ; then
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_SimpleMultiGPU.logrank3
exit $ExitCode
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
exit $?
fi
configFile=$TEST_DIR/../../SimpleMultiGPU.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
# Since we use the MS MPI program on Windows, the CNTK binary path argument
# passed to mpiexec must be in the windows format
CNTKBinaryPath=$TEST_CNTK_BINARY
if [ "$OS" == "Windows_NT" ]; then
CNTKBinaryPath=$(cygpath -aw $CNTKBinaryPath)
fi
MPI_ARGS="-n 4"
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID stderr=$RunDir/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running "$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
"$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/stderr_SimpleMultiGPU.logrank3
exit $ExitCode

Просмотреть файл

@ -1,27 +1,8 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
fi
configFile=$TEST_DIR/cntk_dpt.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
#!/bin/bash
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
. $TEST_ROOT_DIR/run-test-common
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running $TEST_CNTK_BINARY $CNTK_ARGS
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun cntk_dpt.config || exit $?

Просмотреть файл

@ -3,7 +3,11 @@ tags:
# running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (flavor=='debug') ^ (device=='cpu')
# running unconditionally on every Nightly job in 'S' leg
- nightly-s
# TODO: Temporary disabling Release-GPU because of a known bug causing large variance between
# Release and Debug configurations for GPU only for this (Speech/DNN/DiscriminativePreTraining) test.
# This will be re-enabled after the bug has been addressed.
# DO NOT COPY this disablement for other tests!!
- nightly-s (flavor!='release') or (device!='gpu')
testCases:
CNTK Run must be completed:

Просмотреть файл

@ -1,40 +1,17 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
. $TEST_ROOT_DIR/run-test-common
ConfigDir=$TEST_DIR/..
LogFileName=stderr
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
if cntkmpirun "-n 3" cntk.config "precision=double speechTrain=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=1]]]] speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]" ; then
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank2
exit $ExitCode
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
exit $?
fi
configFile=$TEST_DIR/../cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
# Since we use the MS MPI program on Windows, the CNTK binary path argument
# passed to mpiexec must be in the windows format
CNTKBinaryPath=$TEST_CNTK_BINARY
if [ "$OS" == "Windows_NT" ]; then
CNTKBinaryPath=$(cygpath -aw $CNTKBinaryPath)
fi
MPI_ARGS="-n 3"
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID stderr=$RunDir/stderr precision=double speechTrain=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=1]]]] speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running "$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
"$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/stderr_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/stderr_speechTrain.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/stderr_speechTrain.logrank2
exit $ExitCode

Просмотреть файл

@ -1,40 +1,17 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
. $TEST_ROOT_DIR/run-test-common
ConfigDir=$TEST_DIR/..
LogFileName=stderr
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
if cntkmpirun "-n 3" cntk.config ; then
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank2
exit $ExitCode
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
exit $?
fi
configFile=$TEST_DIR/../cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
# Since we use the MS MPI program on Windows, the CNTK binary path argument
# passed to mpiexec must be in the windows format
CNTKBinaryPath=$TEST_CNTK_BINARY
if [ "$OS" == "Windows_NT" ]; then
CNTKBinaryPath=$(cygpath -aw $CNTKBinaryPath)
fi
MPI_ARGS="-n 3"
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID stderr=$RunDir/stderr"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running "$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
"$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/stderr_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/stderr_speechTrain.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/stderr_speechTrain.logrank2
exit $ExitCode

Просмотреть файл

@ -19,9 +19,9 @@ DNN=[
features=Input(featDim, tag=feature)
labels=Input(LabelDim, tag=label)
GlobalMean=Parameter(featDim, init=fromFile, initFromFilePath=$GlobalMean$, computeGradient=false)
GlobalMean=Parameter(featDim, init=fromFile, initFromFilePath=$GlobalMean$, computeGradient=false)
GlobalInvStd=Parameter(featDim, init=fromFile, initFromFilePath=$GlobalInvStd$, computeGradient=false)
GlobalPrior=Parameter(LabelDim, init=fromFile, initFromFilePath=$GlobalPrior$, computeGradient=false)
GlobalPrior=Parameter(LabelDim, init=fromFile, initFromFilePath=$GlobalPrior$, computeGradient=false)
logPrior=Log(GlobalPrior)
# define network

Просмотреть файл

@ -1,7 +1,9 @@
DNNLayer(inDim, outDim, x)
{
W = Parameter(outDim, inDim);
b = Parameter(outDim);
#W = Parameter(outDim, inDim, init=uniform, initValueScale=1, initOnCPUOnly=true, randomSeed=1); # randomizing on CPU with fixed seed to get reproducable results across configurations
#b = Parameter(outDim, init=uniform, initValueScale=1, initOnCPUOnly=true, randomSeed=1);
W = Parameter(outDim, inDim);
b = Parameter(outDim);
t = Times(W, x);
z = Plus(t, b);
y = sigmoid(z);
@ -9,6 +11,8 @@ DNNLayer(inDim, outDim, x)
DNNLastLayer(hiddenDim, LabelDim, x)
{
#W = Parameter(LabelDim, hiddenDim, init=uniform, initValueScale=1, initOnCPUOnly=true, randomSeed=1);
#b = Parameter(LabelDim, init=uniform, initValueScale=1, initOnCPUOnly=true, randomSeed=1);
W = Parameter(LabelDim, hiddenDim);
b = Parameter(LabelDim);
t = Times(W, x);

Просмотреть файл

@ -1,29 +1,8 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
fi
configFile=$TEST_DIR/../cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
NDLDir=$TEST_DIR/..
. $TEST_ROOT_DIR/run-test-common
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
NDLDir=$(cygpath -aw $NDLDir)
fi
ConfigDir=$TEST_DIR/..
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID NDLDir=$NDLDir Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running $TEST_CNTK_BINARY $CNTK_ARGS
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun cntk.config 'Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]' || exit $?

Просмотреть файл

@ -1,29 +1,8 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
fi
configFile=$TEST_DIR/../cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
NDLDir=$TEST_DIR/..
. $TEST_ROOT_DIR/run-test-common
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
NDLDir=$(cygpath -aw $NDLDir)
fi
ConfigDir=$TEST_DIR/..
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID NDLDir=$NDLDir"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running $TEST_CNTK_BINARY $CNTK_ARGS
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun cntk.config || exit $?

Просмотреть файл

@ -14,7 +14,7 @@ speechTrain=[
traceLevel=1
#NDLNetworkBuilder=[
# networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl
# networkDescription=$ConfigDir$/lstmp-3layer_WithSelfStab.ndl
#]
SGD=[

Просмотреть файл

@ -1,31 +1,12 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
CNTK_DEVICE_ID=0
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
fi
configFile=$TEST_DIR/cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
. $TEST_ROOT_DIR/run-test-common
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin translating /cygdrive/xxx paths to proper windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running $TEST_CNTK_BINARY $CNTK_ARGS
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun cntk.config || exit $?
echo === Deleting last epoch data
rm $TEST_RUN_DIR/models/*.dnn
echo ==== Re-running from checkpoint
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
DeleteExistingModels=0
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun cntk.config || exit $?

Просмотреть файл

@ -1,38 +1,32 @@
How to run the Tests\Speech test
================================
Full test
---------
Install Cygwin with the python module.
Execute 'Tests/Testdriver.py run' script. This will run the test in various Tests (recursively). Note that the first time you may get an error about the missing YAML python module that you will need to install.
Command lines for debugging
---------------------------
Note: Below, the current dir is set to be the data directory. This allows for local paths in data scripts.
--- QuickE2E:
--- Speech\QuickE2E:
COMMAND: currentDirectory=$(SolutionDir)Tests\Speech\Data configFile=$(SolutionDir)Tests\Speech\QuickE2E\cntk.config stderr=$(SolutionDir)Tests\Speech\RunDir\QuickE2E\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\Speech\RunDir\QuickE2E DataDir=. DeviceId=Auto
Linux:
bin/cntk configFile=Tests/Speech/QuickE2E/cntk.config RunDir=Tests/Speech/RunDirL/QuickE2E DataDir=Tests/Speech/Data DeviceId=0
# TODO: can stderr refer to RunDir?
--- LSTM\Truncated:
--- Speech\LSTM\Truncated:
COMMAND: currentDirectory=$(SolutionDir)Tests\Speech\Data configFile=$(SolutionDir)Tests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\Speech\RunDir\LSTM\Truncated\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\Speech\RunDir\LSTM\Truncated NdlDir=$(SolutionDir)Tests\Speech\LSTM DataDir=. DeviceId=Auto
--- LSTM\FullUtterance:
--- Speech\LSTM\FullUtterance:
COMMAND: currentDirectory=$(SolutionDir)Tests\Speech\Data configFile=$(SolutionDir)Tests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\Speech\LSTM DataDir=. DeviceId=Auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]
--- Speech\DiscriminativePreTraining:
COMMAND: currentDirectory=$(SolutionDir)Tests\Speech\Data configFile=..\DNN\DiscriminativePreTraining\cntk_dpt.config stderr=..\RunDir\DNN\DiscriminativePreTraining\models\cntkSpeech.dnn.log RunDir=..\RunDir\DNN\DiscriminativePreTraining DataDir=. DeviceId=Auto
--- MNIST:
WORKING DIR:
COMMAND: currentDirectory=$(SolutionDir)ExampleSetups\Image\MNIST configFile=02_Conv.config configName=02_Conv

Просмотреть файл

@ -262,6 +262,10 @@ class Test:
if args.verbose:
print self.fullName + ": " + line
if args.dry_run:
print line
continue
print >>output, line
allLines.append(line)
output.flush()
@ -271,13 +275,17 @@ class Test:
exitCode = process.wait()
success = True
# checking exit code
if exitCode != 0:
return TestRunResult.fatalError("Exit code must be 0", "==> got exit code {0} when running: {1}".format(exitCode, " ".join(cmdLine)), logFile = logFile)
# saving log file path, so it can be reported later
result.logFile = logFile
# checking exit code
if exitCode != 0:
if args.dry_run:
print "[SKIPPED]"
return result
else:
return TestRunResult.fatalError("Exit code must be 0", "==> got exit code {0} when running: {1}".format(exitCode, " ".join(cmdLine)), logFile = logFile)
# finalizing verification - need to check whether we have any unmatched lines
for testCaseRunResult in result.testCaseRunResults:
testCaseRunResult.testCase.finalize(testCaseRunResult)
@ -546,6 +554,8 @@ def runCommand(args):
devices = args.devices
flavors = args.flavors
os.environ["TEST_ROOT_DIR"] = os.path.dirname(os.path.realpath(sys.argv[0]))
print "CNTK Test Driver is started"
print "Running tests: ", " ".join([y.fullName for y in testsToRun])
print "Build location: ", args.build_location
@ -555,6 +565,8 @@ def runCommand(args):
if (args.update_baseline):
print "*** Running in automatic baseline update mode ***"
print ""
if args.dry_run:
os.environ["DRY_RUN"] = "1"
succeededCount, totalCount = 0, 0
for test in testsToRun:
for flavor in flavors:
@ -569,7 +581,6 @@ def runCommand(args):
sys.stdout.write("Running test {0} ({1} {2}) - ".format(test.fullName, flavor, device));
if args.dry_run:
print "[SKIPPED] (dry-run)"
continue
# in verbose mode, terminate the line, since there will be a lot of output
if args.verbose:
sys.stdout.write("\n");
@ -629,7 +640,7 @@ defaultRunDir=os.path.join(tmpDir, "cntk-test-{0}.{1}".format(time.strftime("%Y%
runSubparser.add_argument("-r", "--run-dir", default=defaultRunDir, help="directory where to store test output, default: a random dir within /tmp")
runSubparser.add_argument("--update-baseline", action='store_true', help="update baseline file(s) instead of matching them")
runSubparser.add_argument("-v", "--verbose", action='store_true', help="verbose output - dump all output of test script")
runSubparser.add_argument("-n", "--dry-run", action='store_true', help="do not run the tests, only print test names and condfigurations to be run")
runSubparser.add_argument("-n", "--dry-run", action='store_true', help="do not run the tests, only print test names and configurations to be run along with full command lines")
runSubparser.set_defaults(func=runCommand)

106
Tests/run-test-common Normal file
Просмотреть файл

@ -0,0 +1,106 @@
#!/bin/bash
# Common definitions shared by the per-test run-test scripts of the E2E tests.
# The test driver is expected to export TEST_CNTK_BINARY, TEST_DEVICE,
# TEST_DIR, TEST_RUN_DIR and TEST_DATA_DIR before this file is sourced.
BinaryPath=$TEST_CNTK_BINARY
# Map the requested device onto a CNTK DeviceId (-1 = CPU, 0 = first GPU).
case "$TEST_DEVICE" in
cpu)
CNTKDeviceId=-1
;;
gpu)
CNTKDeviceId=0
;;
*)
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
;;
esac
# Defaults which individual run-test scripts may override before calling cntkrun.
LogFileName=
ConfigDir=$TEST_DIR
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
MPIMode=0
MPIArgs=
DeleteExistingModels=1
# run <command> [args...]
# Print and (unless DRY_RUN=1) execute a command, forwarding its exit code.
# In dry-run mode nothing is executed: the command line is echoed instead.
# On Windows/cygwin the dry-run output additionally translates paths to
# native Windows form and rewrites the test root as $(SolutionDir)\Tests so
# the line can be pasted into Visual Studio debugger settings.
# NOTE(review): dry-run deliberately returns 1 (not 0), presumably so callers
# do not mistake a printed-only command for a successful run -- confirm
# against the run-test scripts that consume this helper.
run()
{
cmd=$1
shift
if [ "$DRY_RUN" == "1" ]; then
workingDir=$PWD
if [ "$OS" == "Windows_NT" ]; then
workingDir=$(cygpath -aw $workingDir)
# For non-MPI runs translate the binary path itself as well (for MPI
# runs cntkmpirun has already converted BinaryPath for mpiexec).
if [[ $MPIMode == 0 ]]; then
cmd=$(cygpath -aw $cmd)
# Double every backslash in the Windows test-root path so it can be
# used safely as a sed pattern in the substitutions below.
TEST_ROOT_DIR_ESCAPED=`echo -n $(cygpath -aw $TEST_ROOT_DIR) | sed 's/\\\\/\\\\\\\\/g'`
workingDir=`echo "$workingDir" | sed "s/$TEST_ROOT_DIR_ESCAPED/\\$\\(SolutionDir\\)\\\\\\\\Tests/g"`
fi
fi
echo Working Directory: $workingDir
echo Full command: "$cmd" "$@"
if [ "$OS" == "Windows_NT" ]; then
if [[ $MPIMode == 0 ]]; then
# Print the argument list with the test root rewritten, in a form
# suitable for the VS project's debugging command arguments.
echo VS debugging command args: "$@" | sed "s/$TEST_ROOT_DIR_ESCAPED/\\$\\(SolutionDir\\)\\\\Tests/g"
fi
fi
return 1
else
echo === Running "$cmd" "$@"
"$cmd" "$@"
return $?
fi
}
# Launch the CNTK executable through run().
# cntkrun <CNTK config file name> <additional CNTK args>
# Uses/overwrites the globals ConfigDir, RunDir, DataDir, CNTKArgs, modelsDir;
# honors LogFileName, DeleteExistingModels, MPIMode and MPIArgs.
cntkrun()
{
    configFileName=$1
    additionalCNTKArgs=$2

    # Under cygwin, CNTK itself needs native Windows paths.
    case $OS in
    Windows_NT)
        ConfigDir=$(cygpath -aw $ConfigDir)
        RunDir=$(cygpath -aw $RunDir)
        DataDir=$(cygpath -aw $DataDir)
        ;;
    esac

    CNTKArgs="configFile=$ConfigDir/$configFileName RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTKDeviceId $additionalCNTKArgs"
    # Redirect CNTK's stderr into the run directory when a log name was chosen.
    [ -n "$LogFileName" ] && CNTKArgs="$CNTKArgs stderr=$RunDir/$LogFileName"

    # Start each run from a clean models directory unless the test opted out.
    modelsDir=$TEST_RUN_DIR/models
    if [ "$DeleteExistingModels" = 1 ] && [ -d $modelsDir ]; then
        rm -rf $modelsDir
    fi
    mkdir -p $modelsDir || exit $?

    if [ "$MPIMode" = 0 ]; then
        run "$BinaryPath" $CNTKArgs
    else
        run "$MPI_BINARY" $MPIArgs $BinaryPath $CNTKArgs
    fi
    return $?
}
# Launch a parallel CNTK run under MPI (delegates to cntkrun with MPIMode=1).
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun()
{
    MPIMode=1
    MPIArgs=$1

    # The MS MPI mpiexec on Windows requires the CNTK binary path argument
    # in native Windows form.
    [ "$OS" = "Windows_NT" ] && BinaryPath=$(cygpath -aw $BinaryPath)

    cntkrun "$2" "$3"
    return $?
}

81
configure поставляемый
Просмотреть файл

@ -28,6 +28,13 @@ have_gdk=no
gdk_path=
gdk_check=include/nvidia/gdk/nvml.h
have_cub=no
cub_path=
cub_check=cub/cub.cuh
have_opencv=no
opencv_path=
opencv_check=include/opencv2/opencv.hpp
mathlib=
# List from best to worst choice
@ -41,6 +48,8 @@ default_mkls=""
default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
default_kaldis="kaldi-trunk"
default_gdks=". gdk/usr"
default_cubs="cub-1.4.1"
default_opencvs="opencv-3.0.0"
function default_paths ()
{
@ -101,6 +110,15 @@ function find_gdk ()
find_dir "$default_gdks" "$gdk_check"
}
# Auto-detect an NVIDIA CUB installation: searches the default candidate
# directories (default_cubs) for the marker file cub/cub.cuh via find_dir.
function find_cub ()
{
find_dir "$default_cubs" "$cub_check"
}
# Auto-detect an OpenCV installation: searches the default candidate
# directories (default_opencvs) for include/opencv2/opencv.hpp via find_dir.
function find_opencv ()
{
find_dir "$default_opencvs" "$opencv_check"
}
function is_hardlinked ()
{
r=no
@ -142,11 +160,13 @@ function show_help ()
echo " --add directory add directory to library search path"
echo " --cuda[=(yes|no)] use cuda GPU $(show_default $(default_use_cuda))"
echo " --with-cuda[=directory] $(show_default $(find_cuda))"
echo " --with-cub[=directory] $(show_default $(find_cub))"
echo " --with-gdk[=directory] $(show_default $(find_gdk))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-buildtype=(debug|release) $(show_default $default_buildtype)"
echo " --with-kaldi[=directory] $(show_default $(find_kaldi))"
echo " --with-opencv[=directory] $(show_default $(find_opencv))"
echo "Libraries search path:"
for head in $(default_paths)
do
@ -215,6 +235,28 @@ do
fi
fi
;;
# --with-cub[=dir]: enable NVIDIA CUB support. $optarg is the optional
# directory parsed from the option by the enclosing argument loop.
--with-cub*)
have_cub=yes
if test x$optarg = x
then
# No directory given: fall back to the default search locations.
cub_path=$(find_cub)
if test x$cub_path = x
then
echo "Cannot find NVIDIA CUB directory."
echo "Please specify a value for --with-cub"
echo "NVIDIA CUB can be downloaded from https://github.com/NVlabs/cub/archive/1.4.1.zip, extract the archive to /usr/local"
exit 1
fi
else
# Explicit directory: verify it contains the cub/cub.cuh marker file.
if test $(check_dir $optarg $cub_check) = yes
then
cub_path=$optarg
else
echo "Invalid CUB directory $optarg"
exit 1
fi
fi
;;
--with-gdk*)
have_gdk=yes
if test x$optarg = x
@ -313,6 +355,28 @@ do
fi
fi
;;
# --with-opencv[=dir]: enable OpenCV support. $optarg is the optional
# directory parsed from the option by the enclosing argument loop.
--with-opencv*)
have_opencv=yes
if test x$optarg = x
then
# No directory given: fall back to the default search locations.
opencv_path=$(find_opencv)
if test x$opencv_path = x
then
echo "Cannot find OpenCV directory."
echo "Please specify a value for --with-opencv"
echo "OpenCV can be downloaded from http://opencv.org/downloads.html, install instructions http://docs.opencv.org/doc/tutorials/introduction/linux_install/linux_install.html#linux-installation"
exit 1
fi
else
# Explicit directory: verify it contains include/opencv2/opencv.hpp.
if test $(check_dir $optarg $opencv_check) = yes
then
opencv_path=$optarg
else
echo "Invalid OpenCV directory $optarg"
exit 1
fi
fi
;;
*)
echo Invalid option $key
show_help
@ -372,6 +436,18 @@ then
fi
fi
# If CUDA is enabled but no --with-cub was given, try to auto-detect CUB.
# A missing CUB disables the GPU build (enable_cuda=no) instead of aborting
# configure, so CPU-only builds still succeed.
if test $enable_cuda = yes && test x$cub_path = x
then
cub_path=$(find_cub)
if test x$cub_path = x ; then
echo Cannot locate NVIDIA CUB directory
echo GPU will be disabled
echo NVIDIA CUB can be downloaded from https://github.com/NVlabs/cub/archive/1.4.1.zip, extract the archive to /usr/local
enable_cuda=no
else
echo Found CUB at $cub_path
fi
fi
config=$build_top/Config.make
echo Generating $config
echo "#Configuration file for cntk" > $config
@ -388,10 +464,15 @@ esac
if test $enable_cuda = yes ; then
echo CUDA_PATH=$cuda_path >> $config
echo GDK_PATH=$gdk_path >> $config
echo CUB_PATH=$cub_path >> $config
fi
if test x$kaldi_path != x ; then
echo KALDI_PATH=$kaldi_path >> $config
fi
if test x$opencv_path != x ; then
echo Found OpenCV at $opencv_path
echo OPENCV_PATH=$opencv_path >> $config
fi
# If we are not in the configure directory, generate a trampoline Makefile
makefile=$build_top/Makefile