diff --git a/.gitignore b/.gitignore
index 00a6a56c7..c08ca3570 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,162 +1,168 @@
-## Ignore Visual Studio temporary files, build results, and
-## files generated by popular Visual Studio add-ons.
-
-# User-specific files
-*.suo
-*.user
-*.sln.docstates
-*.orig
-
-# Build results
-
-[Dd]ebug/
-[Rr]elease/
-x64/
-build/
-[Bb]in/
-[Oo]bj/
-
-# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
-!packages/*/build/
-
-# MSTest test Results
-[Tt]est[Rr]esult*/
-[Bb]uild[Ll]og.*
-
-*_i.c
-*_p.c
-*.ilk
-*.meta
-*.obj
-*.pch
-*.pdb
-*.pgc
-*.pgd
-*.rsp
-*.sbr
-*.tlb
-*.tli
-*.tlh
-*.tmp
-*.tmp_proj
-*.log
-*.vspscc
-*.vssscc
-.builds
-*.pidb
-*.log
-*.scc
-*.dep
-
-# Visual C++ cache files
-ipch/
-*.aps
-*.ncb
-*.opensdf
-*.sdf
-*.cachefile
-
-# Visual Studio profiler
-*.psess
-*.vsp
-*.vspx
-
-# Guidance Automation Toolkit
-*.gpState
-
-# ReSharper is a .NET coding add-in
-_ReSharper*/
-*.[Rr]e[Ss]harper
-
-# TeamCity is a build add-in
-_TeamCity*
-
-# DotCover is a Code Coverage Tool
-*.dotCover
-
-# NCrunch
-*.ncrunch*
-.*crunch*.local.xml
-
-# Installshield output folder
-[Ee]xpress/
-
-# DocProject is a documentation generator add-in
-DocProject/buildhelp/
-DocProject/Help/*.HxT
-DocProject/Help/*.HxC
-DocProject/Help/*.hhc
-DocProject/Help/*.hhk
-DocProject/Help/*.hhp
-DocProject/Help/Html2
-DocProject/Help/html
-
-# Click-Once directory
-publish/
-
-# Publish Web Output
-*.Publish.xml
-
-# NuGet Packages Directory
-## TODO: If you have NuGet Package Restore enabled, uncomment the next line
-#packages/
-
-# Windows Azure Build Output
-csx
-*.build.csdef
-
-# Windows Store app package directory
-AppPackages/
-
-# Others
-sql/
-*.Cache
-ClientBin/
-[Ss]tyle[Cc]op.*
-~$*
-*~
-*.dbmdl
-*.[Pp]ublish.xml
-*.pfx
-*.publishsettings
-
-# RIA/Silverlight projects
-Generated_Code/
-
-# Backup & report files from converting an old project file to a newer
-# Visual Studio version. Backup files are not needed, because we have git ;-)
-_UpgradeReport_Files/
-Backup*/
-UpgradeLog*.XML
-UpgradeLog*.htm
-
-# SQL Server files
-App_Data/*.mdf
-App_Data/*.ldf
-
-
-#LightSwitch generated files
-GeneratedArtifacts/
-_Pvt_Extensions/
-ModelManifest.xml
-
-# =========================
-# Windows detritus
-# =========================
-
-# Windows image file caches
-Thumbs.db
-ehthumbs.db
-
-# Folder config file
-Desktop.ini
-
-# Recycle Bin used on file shares
-$RECYCLE.BIN/
-
-# Mac desktop service store files
-.DS_Store
-
-*.lyx~
-*.bak
-*.lyx#
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+*.orig
+
+# Build results
+
+[Dd]ebug/
+[Rr]elease/
+x64/
+build/
+[Bb]in/
+[Oo]bj/
+
+# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
+!packages/*/build/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.log
+*.scc
+*.dep
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+*.cachefile
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+*.ncrunch*
+.*crunch*.local.xml
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.Publish.xml
+
+# NuGet Packages Directory
+## TODO: If you have NuGet Package Restore enabled, uncomment the next line
+#packages/
+
+# Windows Azure Build Output
+csx
+*.build.csdef
+
+# Windows Store app package directory
+AppPackages/
+
+# Others
+sql/
+*.Cache
+ClientBin/
+[Ss]tyle[Cc]op.*
+~$*
+*~
+*.dbmdl
+*.[Pp]ublish.xml
+*.pfx
+*.publishsettings
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+App_Data/*.mdf
+App_Data/*.ldf
+
+
+#LightSwitch generated files
+GeneratedArtifacts/
+_Pvt_Extensions/
+ModelManifest.xml
+
+# =========================
+# Windows detritus
+# =========================
+
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Mac desktop service store files
+.DS_Store
+
+*.lyx~
+*.bak
+*.lyx#
+
+# =========================
+# prebuild file
+# =========================
+MachineLearning/cn/buildinfo.h
+
diff --git a/Common/ConfigFile.cpp b/Common/ConfigFile.cpp
index f3902df4b..f5eb505a3 100644
--- a/Common/ConfigFile.cpp
+++ b/Common/ConfigFile.cpp
@@ -1,279 +1,280 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// ConfigFile.cpp : Defines the configuration file loader.
-//
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "File.h"
-#include "commandArgUtil.h"
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-
-    // ParseCommandLine - parse the command line parameters
-    // argc - count of arguments
-    // argv - array of argument parameters
-    // config - config to return
-    std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
-    {
-        config.SetName(std::string("global"));
-
-        // This vector keeps track of the config files we have already read
-        std::vector<std::string> resolvedConfigFiles;
-        std::string configString;
-
-        // start at 1, because 0 is the name of the EXE
-        for (int i=1; i < argc; ++i)
-        {
-            wstring str = argv[i];
-
-            // see if they are loading a config file
-            wstring configDescriptor = L"configFile=";
-            int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
-
-            // no config file, parse as regular argument
-            if (compare)
-            {
-                configString += (msra::strfun::utf8(str) + "\n");
-            }
-            else // One or more config file paths specified in a "+"-separated list.
-            {
-                const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
-                std::vector<std::string> filePathsVec = msra::strfun::split(filePaths, "+");
-                for (auto filePath : filePathsVec)
-                {
-                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
-                    {
-                        // if we haven't already read this file, read it
-                        resolvedConfigFiles.push_back(filePath);
-                        configString += config.ReadConfigFile(filePath);
-                    }
-                    else
-                        RuntimeError("Cannot specify same config file multiple times at the command line.");
-                }
-            }
-        }
-
-        configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
-        config.FileParse(configString);
-        return configString;
-    }
-
-    // ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
-    // form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
-    // If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
-    // and includes the body of each file in the string which is eventually returned by this function. If the included
-    // config file includes other config files, this function will recursively include those files as well.
-    // configString - the config string within which to look for "include" statements
-    // resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
-    //                       and to prevent files from being included multiple times.
-    // returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
-    std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector<std::string> &resolvedConfigFiles)
-    {
-        std::vector<std::string> lines = msra::strfun::split(configString, "\n");
-        std::string includeKeyword = "include=";
-        std::size_t includeKeywordSize = includeKeyword.size();
-        std::string newConfigString;
-        for (std::string line : lines)
-        {
-            if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
-            {
-                std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
-                if (filePaths.find(openBraceVar) != std::string::npos)
-                {
-                    RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
-                }
-
-                std::vector<std::string> filePathVec = msra::strfun::split (filePaths, "+");
-                for (auto filePath : filePathVec)
-                {
-                    // if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
-                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
-                    {
-                        // Recursively resolve the include statements in the included config files.
-                        // Ensure that the same config file isn't included twice, by keeping track of the config
-                        // files that have already been resolved in the resolvedPaths vector.
-                        resolvedConfigFiles.push_back(filePath);
-                        newConfigString += ResolveIncludeStatements(
-                            ReadConfigFile(filePath),
-                            resolvedConfigFiles
-                            );
-                    }
-                    else
-                    {
-                        // We already resolved this path. Write a warning so that the user is aware of this.
-                        // TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
-                        fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
-                    }
-                }
-            }
-            else
-            {
-                newConfigString += (line + "\n");
-            }
-        }
-        return newConfigString;
-    }
-
-    // LoadConfigFiles - load multiple configuration files, and add them to the config parameters
-    // filePaths - A "+" delimited list of file paths, corresponding to config files to load
-    // configStringToAppend - A config string which should be processed together with the config files
-    void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
-    {
-        std::string configString = ReadConfigFiles(filePaths);
-        if(configStringToAppend != nullptr)
-        {
-            configString += *configStringToAppend;
-        }
-
-        FileParse(configString);
-    }
-
-    // LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
-    // If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
-    // then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
-    // in order to allow them to access variables defined in the primary config file via $varName$ syntax.
-    // filePath - filePath to the file to load
-    // config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
-    void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
-    {
-        // read file, resolve variables, and then parse.
-        std::string fileContents = ReadConfigFile(filePath);
-        fileContents = config.ResolveVariables(fileContents);
-        FileParse(fileContents);
-    }
-
-    // LoadConfigFile - load a configuration file, and add to config parameters
-    // filePath - filePath to the file to read
-    void ConfigParser::LoadConfigFile(const std::wstring &filePath)
-    {
-        // read and then parse
-        FileParse(ReadConfigFile(filePath));
-    }
-
-    // Same as the "ReadConfigFiles" function below, but takes a string instead of a wstring
-    std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
-    {
-        return ReadConfigFiles(msra::strfun::utf16(filePaths));
-    }
-
-    // ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
-    // filePaths - A "+" delimited list of file paths, corresponding to config files to read
-    // returns: a string with the concatenated file contents
-    std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
-    {
-        std::string configString;
-        std::vector<std::wstring> filePathVec = msra::strfun::split (filePaths, L"+");
-        for (auto filePath : filePathVec)
-        {
-            configString += ReadConfigFile(filePath);
-        }
-        return configString;
-    }
-
-    // Same as the "ReadConfigFile" function below, but takes a string instead of a wstring
-    std::string ConfigParser::ReadConfigFile(const std::string &filePath)
-    {
-        return ReadConfigFile(msra::strfun::utf16(filePath));
-    }
-
-    // ReadConfigFile - read a configuration file, and return it as a string
-    // filePath - the path to the config file to read
-    // returns: a string with the concatenated file contents
-    std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
-    {
-        File file(filePath, fileOptionsRead);
-
-        // initialize with file name
-        std::string path = msra::strfun::utf8(filePath);
-        auto location = path.find_last_of("/\\");
-        if (location != npos)
-            path = path.substr(location+1);
-        m_configName = move(path);
-
-        // read the entire file into a string
-        // CONSIDER: should the File API support this, instead of line by line?
-        size_t fileLength = file.Size();
-        string str;
-        string configFile;
-        configFile.reserve(fileLength);
-        while (!file.IsEOF())
-        {
-            file.GetLine(str);
-            str = PreprocessConfigLine(str);
-            if (str != "")
-            {
-                configFile.append(str);
-                configFile.append("\n");
-            }
-        }
-        return configFile;
-    }
-
-    // GetFileConfigNames - determine the names of the features and labels sections in the config file
-    // features - [in,out] a vector of feature name strings
-    // labels - [in,out] a vector of label name strings
-    void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
-    {
-        for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
-        {
-            auto pair = *iter;
-            ConfigParameters temp (iter->second);
-            // see if we have a config section that contains a "dim" element; if it's a sub key, use it
-            if (temp.ExistsCurrent("dim"))
-            {
-                if (temp.ExistsCurrent("labelMappingFile")
-                    || temp.ExistsCurrent("labelDim")
-                    || temp.ExistsCurrent("labelType")
-                    || (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
-                {
-                    labels.push_back(msra::strfun::utf16(iter->first));
-                }
-                else
-                {
-                    features.push_back(msra::strfun::utf16(iter->first));
-                }
-            }
-        }
-    }
-
-    // FindConfigNames - determine the names of the hierarchy of sections in the config file that contain a particular key
-    // config - configuration to search
-    // key - string we are searching for in each config section
-    // names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
-    void FindConfigNames(const ConfigParameters& config, std::string key, std::vector<std::wstring>& names)
-    {
-        for (auto iter = config.begin(); iter != config.end(); ++iter)
-        {
-            auto pair = *iter;
-            ConfigParameters temp (iter->second);
-            // see if we have a config section that contains the "key" element; if so, use it
-            if (temp.ExistsCurrent(key))
-            {
-                names.push_back(msra::strfun::utf16(iter->first));
-            }
-        }
-    }
-
-    // Trim - trim white space off the start and end of the string
-    // str - string to trim
-    // NOTE: if the entire string is whitespace, then the string will be set to an empty string
-    void Trim(std::string& str)
-    {
-        auto found = str.find_first_not_of(" \t");
-        if (found == npos)
-        {
-            str.erase(0);
-            return;
-        }
-        str.erase(0, found);
-        found = str.find_last_not_of(" \t");
-        if (found != npos)
-            str.erase(found+1);
-    }
-
-}}}
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// ConfigFile.cpp : Defines the configuration file loader.
+//
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
+#include "File.h"
+#include "commandArgUtil.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+
+    // ParseCommandLine - parse the command line parameters
+    // argc - count of arguments
+    // argv - array of argument parameters
+    // config - config to return
+    std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
+    {
+        config.SetName(std::string("global"));
+
+        // This vector keeps track of the config files we have already read
+        std::vector<std::string> resolvedConfigFiles;
+        std::string configString;
+
+        // start at 1, because 0 is the name of the EXE
+        for (int i=1; i < argc; ++i)
+        {
+            wstring str = argv[i];
+
+            // see if they are loading a config file
+            wstring configDescriptor = L"configFile=";
+            int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
+
+            // no config file, parse as regular argument
+            if (compare)
+            {
+                configString += (msra::strfun::utf8(str) + "\n");
+            }
+            else // One or more config file paths specified in a "+"-separated list.
+            {
+                const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
+                std::vector<std::string> filePathsVec = msra::strfun::split(filePaths, "+");
+                for (auto filePath : filePathsVec)
+                {
+                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
+                    {
+                        // if we haven't already read this file, read it
+                        resolvedConfigFiles.push_back(filePath);
+                        configString += config.ReadConfigFile(filePath);
+                    }
+                    else
+                        RuntimeError("Cannot specify same config file multiple times at the command line.");
+                }
+            }
+        }
+
+        configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
+        config.FileParse(configString);
+        return configString;
+    }
+
+    // ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
+    // form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
+    // If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
+    // and includes the body of each file in the string which is eventually returned by this function. If the included
+    // config file includes other config files, this function will recursively include those files as well.
+    // configString - the config string within which to look for "include" statements
+    // resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
+    //                       and to prevent files from being included multiple times.
+    // returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
+    std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector<std::string> &resolvedConfigFiles)
+    {
+        std::vector<std::string> lines = msra::strfun::split(configString, "\n");
+        std::string includeKeyword = "include=";
+        std::size_t includeKeywordSize = includeKeyword.size();
+        std::string newConfigString;
+        for (std::string line : lines)
+        {
+            if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
+            {
+                std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
+                if (filePaths.find(openBraceVar) != std::string::npos)
+                {
+                    RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
+                }
+
+                std::vector<std::string> filePathVec = msra::strfun::split (filePaths, "+");
+                for (auto filePath : filePathVec)
+                {
+                    // if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
+                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
+                    {
+                        // Recursively resolve the include statements in the included config files.
+                        // Ensure that the same config file isn't included twice, by keeping track of the config
+                        // files that have already been resolved in the resolvedPaths vector.
+                        resolvedConfigFiles.push_back(filePath);
+                        newConfigString += ResolveIncludeStatements(
+                            ReadConfigFile(filePath),
+                            resolvedConfigFiles
+                            );
+                    }
+                    else
+                    {
+                        // We already resolved this path. Write a warning so that the user is aware of this.
+                        // TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
+                        fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
+                    }
+                }
+            }
+            else
+            {
+                newConfigString += (line + "\n");
+            }
+        }
+        return newConfigString;
+    }
+
+    // LoadConfigFiles - load multiple configuration files, and add them to the config parameters
+    // filePaths - A "+" delimited list of file paths, corresponding to config files to load
+    // configStringToAppend - A config string which should be processed together with the config files
+    void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
+    {
+        std::string configString = ReadConfigFiles(filePaths);
+        if(configStringToAppend != nullptr)
+        {
+            configString += *configStringToAppend;
+        }
+
+        FileParse(configString);
+    }
+
+    // LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
+    // If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
+    // then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
+    // in order to allow them to access variables defined in the primary config file via $varName$ syntax.
+    // filePath - filePath to the file to load
+    // config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
+    void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
+    {
+        // read file, resolve variables, and then parse.
+        std::string fileContents = ReadConfigFile(filePath);
+        fileContents = config.ResolveVariables(fileContents);
+        FileParse(fileContents);
+    }
+
+    // LoadConfigFile - load a configuration file, and add to config parameters
+    // filePath - filePath to the file to read
+    void ConfigParser::LoadConfigFile(const std::wstring &filePath)
+    {
+        // read and then parse
+        FileParse(ReadConfigFile(filePath));
+    }
+
+    // Same as the "ReadConfigFiles" function below, but takes a string instead of a wstring
+    std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
+    {
+        return ReadConfigFiles(msra::strfun::utf16(filePaths));
+    }
+
+    // ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
+    // filePaths - A "+" delimited list of file paths, corresponding to config files to read
+    // returns: a string with the concatenated file contents
+    std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
+    {
+        std::string configString;
+        std::vector<std::wstring> filePathVec = msra::strfun::split (filePaths, L"+");
+        for (auto filePath : filePathVec)
+        {
+            configString += ReadConfigFile(filePath);
+        }
+        return configString;
+    }
+
+    // Same as the "ReadConfigFile" function below, but takes a string instead of a wstring
+    std::string ConfigParser::ReadConfigFile(const std::string &filePath)
+    {
+        return ReadConfigFile(msra::strfun::utf16(filePath));
+    }
+
+    // ReadConfigFile - read a configuration file, and return it as a string
+    // filePath - the path to the config file to read
+    // returns: a string with the concatenated file contents
+    std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
+    {
+        File file(filePath, fileOptionsRead);
+
+        // initialize with file name
+        std::string path = msra::strfun::utf8(filePath);
+        auto location = path.find_last_of("/\\");
+        if (location != npos)
+            path = path.substr(location+1);
+        m_configName = move(path);
+
+        // read the entire file into a string
+        // CONSIDER: should the File API support this, instead of line by line?
+        size_t fileLength = file.Size();
+        string str;
+        string configFile;
+        configFile.reserve(fileLength);
+        while (!file.IsEOF())
+        {
+            file.GetLine(str);
+            str = PreprocessConfigLine(str);
+            if (str != "")
+            {
+                configFile.append(str);
+                configFile.append("\n");
+            }
+        }
+        return configFile;
+    }
+
+    // GetFileConfigNames - determine the names of the features and labels sections in the config file
+    // features - [in,out] a vector of feature name strings
+    // labels - [in,out] a vector of label name strings
+    void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
+    {
+        for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
+        {
+            auto pair = *iter;
+            ConfigParameters temp (iter->second);
+            // see if we have a config section that contains a "dim" element; if it's a sub key, use it
+            if (temp.ExistsCurrent("dim"))
+            {
+                if (temp.ExistsCurrent("labelMappingFile")
+                    || temp.ExistsCurrent("labelDim")
+                    || temp.ExistsCurrent("labelType")
+                    || (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
+                {
+                    labels.push_back(msra::strfun::utf16(iter->first));
+                }
+                else
+                {
+                    features.push_back(msra::strfun::utf16(iter->first));
+                }
+            }
+        }
+    }
+
+    // FindConfigNames - determine the names of the hierarchy of sections in the config file that contain a particular key
+    // config - configuration to search
+    // key - string we are searching for in each config section
+    // names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
+    void FindConfigNames(const ConfigParameters& config, std::string key, std::vector<std::wstring>& names)
+    {
+        for (auto iter = config.begin(); iter != config.end(); ++iter)
+        {
+            auto pair = *iter;
+            ConfigParameters temp (iter->second);
+            // see if we have a config section that contains the "key" element; if so, use it
+            if (temp.ExistsCurrent(key))
+            {
+                names.push_back(msra::strfun::utf16(iter->first));
+            }
+        }
+    }
+
+    // Trim - trim white space off the start and end of the string
+    // str - string to trim
+    // NOTE: if the entire string is whitespace, then the string will be set to an empty string
+    void Trim(std::string& str)
+    {
+        auto found = str.find_first_not_of(" \t");
+        if (found == npos)
+        {
+            str.erase(0);
+            return;
+        }
+        str.erase(0, found);
+        found = str.find_last_not_of(" \t");
+        if (found != npos)
+            str.erase(found+1);
+    }
+
+}}}
\ No newline at end of file
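
The loop-prevention idea in ConfigParser::ResolveIncludeStatements above is easiest to see in isolation. Below is a minimal, self-contained sketch of the same duplicate-include guard; the g_files map and readFile helper are hypothetical stand-ins for ConfigParser::ReadConfigFile, and only the resolved-list bookkeeping mirrors the shipped code.

#include <algorithm>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for ConfigParser::ReadConfigFile: serves file bodies from memory.
static std::map<std::string, std::string> g_files = {
    { "a.cfg", "x=1\ninclude=b.cfg\n" },
    { "b.cfg", "y=2\ninclude=a.cfg\n" }, // includes a.cfg again, forming a cycle
};
static std::string readFile(const std::string& path) { return g_files[path]; }

// Splice included files into the config text, including each file at most once.
static std::string resolveIncludes(const std::string& text, std::vector<std::string>& resolved)
{
    const std::string keyword = "include=";
    std::string out;
    std::istringstream in(text);
    std::string line;
    while (std::getline(in, line))
    {
        if (line.compare(0, keyword.size(), keyword) != 0)
        {
            out += line + "\n"; // ordinary line: copy through
            continue;
        }
        std::string path = line.substr(keyword.size());
        if (std::find(resolved.begin(), resolved.end(), path) == resolved.end())
        {
            resolved.push_back(path);                         // mark resolved *before* recursing...
            out += resolveIncludes(readFile(path), resolved); // ...so include cycles terminate
        }
    }
    return out;
}

int main()
{
    std::vector<std::string> resolved = { "a.cfg" };
    std::cout << resolveIncludes(readFile("a.cfg"), resolved); // prints "x=1\ny=2\n"
}

The design point carried over from the code above: a file is recorded as resolved before its contents are recursed into, which is exactly what breaks a.cfg -> b.cfg -> a.cfg cycles.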
diff --git a/Common/File.cpp b/Common/File.cpp
index fc77c0af3..896b5dd22 100644
--- a/Common/File.cpp
+++ b/Common/File.cpp
@@ -1,631 +1,633 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "basetypes.h"
-#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
-#include "fileutil.h"
-#include "File.h"
-#include
-#include
-#include
-#ifdef _WIN32
-#include
-#endif
-#ifdef __unix__
-#include
-#endif
-
-namespace Microsoft{ namespace MSR { namespace CNTK {
-
-// File creation
-// filename - the path
-// fileOptions - options to open the file
-File::File(const std::wstring& filename, int fileOptions)
-{
-    Init(filename.c_str(), fileOptions);
-}
-
-File::File(const std::string& filename, int fileOptions)
-{
-    // this converts from string to wstring, and then to wchar_t*
-    Init(msra::strfun::utf16(filename).c_str(), fileOptions);
-}
-
-File::File(const wchar_t* filename, int fileOptions)
-{
-    Init(filename, fileOptions);
-}
-
-void File::Init(const wchar_t* filename, int fileOptions)
-{
-    msra::files::make_intermediate_dirs(filename);
-    // translate the options string into a string for fopen()
-    wstring options = fileOptions&fileOptionsRead?L"r":L"";
-    if (fileOptions&fileOptionsWrite)
-    {
-        // if we already are reading the file, change to read/write
-        options.clear();
-        options.append(L"w+");
-    }
-    if (fileOptions&fileOptionsBinary)
-    {
-        options += L"b";
-    }
-    else
-    {
-        if (fileOptions & fileOptionsUnicode)
-            options += L"b";
-        else
-            options += L"t";
-        // I attempted to use the translated character-set modes, but encountered strange errors
-        //options += L"t, ccs=";
-        //options += (fileOptions & fileOptionsUnicode)?L"UNICODE":L"UTF-8";
-    }
-    // add sequential flag to allocate big read buffer
-    if (fileOptions & fileOptionsSequential)
-        options += L"S";
-
-    attempt([=](){m_file = fopenOrDie(filename, options.c_str());});
-    m_options = fileOptions;
-    m_size = filesize(m_file);
-}
-
-void File::goToDelimiter(int delim)
-{
-    int ch=0;
-
-    while (ch!=delim) {
-        ch=fgetc(m_file);
-        if (feof(m_file)) {
-            printf("Unexpected end of file\n");
-            throw std::logic_error("Unexpected end of file\n");
-        }
-    }
-}
-
-bool File::IsTextBased()
-{
-    return !!(m_options & (fileOptionsText|fileOptionsUnicode));
-}
-
-// File Destructor
-// closes the file
-File::~File(void)
-{
-    attempt([=] {fcloseOrDie(m_file);});
-}
-
-// GetLine - get a line from the file
-// str - string to store the line
-void File::GetLine(wstring& str)
-{
-    str = fgetlinew(m_file);
-}
-
-// GetLine - get a line from the file
-// str - string
-void File::GetLine(string& str)
-{
-    str = fgetline(m_file);
-}
-
-// Put a zero/space terminated wstring into a file
-// val - value to write to the file
-File& File::operator<<(const std::wstring& val)
-{
-    WriteString(val.c_str());
-    return *this;
-}
-
-
-// Put a zero/space terminated string into a file
-// val - value to write to the file
-File& File::operator<<(const std::string& val)
-{
-    WriteString(val.c_str());
-    return *this;
-}
-
-// Put a marker in the file, the marker depends on the file type
-// marker - marker to place in the file
-File& File::operator<<(FileMarker marker)
-{
-    File& file = *this;
-    switch(marker)
-    {
-    case fileMarkerBeginFile: // beginning of file marker
-        // only exists for UNICODE files
-        if (m_options & fileOptionsUnicode)
-            file << (unsigned int)0xfeff; // byte order mark
-        break;
-    case fileMarkerEndFile: // end of file marker
-        // use ^Z for end of file for text files
-        if (m_options & fileOptionsUnicode)
-            file << wchar_t(26); // ^Z
-        else if (m_options & fileOptionsText)
-            file << char(26);
-        break;
-    case fileMarkerBeginList: // Beginning of list marker
-        // no marker written for either
-        break;
-    case fileMarkerListSeparator: // separate elements of a list
-        // do nothing for now, built-in space delimiter for all types (before type)
-        // future: make this customizable, so you can specify a separator (i.e. ',')
-        break;
-    case fileMarkerEndList: // end of line/list marker
-        if (m_options & fileOptionsUnicode)
-            file.WriteString(L"\r\n"); // carriage return/line feed
-        else if (m_options & fileOptionsText)
-            file.WriteString("\r\n");
-        break;
-    case fileMarkerBeginSection: // beginning of section
-    case fileMarkerEndSection: // end of section
-        assert(false); // sections should use a string modifier
-        break;
-    }
-    return file;
-}
-
-// PutMarker for beginning of list support (lists with a count)
-// count - [in] the number of elements in the list
-File& File::PutMarker(FileMarker marker, size_t count)
-{
-    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count markers
-    *this << count;
-    return *this;
-}
-
-// PutMarker for section beginning and ending tags
-// section - [in]name of section
-File& File::PutMarker(FileMarker marker, const std::string& section)
-{
-    File& file = *this;
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    file << section;
-    return file;
-}
-
-// PutMarker for section beginning and ending tags
-// section - [in]name of section
-File& File::PutMarker(FileMarker marker, const std::wstring& section)
-{
-    File& file = *this;
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    file << section;
-    return file;
-}
-
-// Get a zero terminated wstring from a file
-// val - value to read from the file
-File& File::operator>>(std::wstring& val)
-{
-    attempt([&]{
-        if (IsTextBased())
-            val = fgetwtoken(m_file);
-        else
-            val = fgetwstring(m_file);
-    });
-    return *this;
-}
-
-// Get a zero terminated string from a file
-// val - value to read from the file
-File& File::operator>>(std::string& val)
-{
-    attempt([&]{
-        if (IsTextBased())
-            val = fgettoken(m_file);
-        else
-            val = fgetstring(m_file);
-    });
-    return *this;
-}
-
-// ReadChars - read a specified number of characters, and reset read pointer if requested
-// val - [in,out] return value will be returned here
-// cnt - number of characters to read
-// reset - reset the read pointer
-void File::ReadChars(std::string& val, size_t cnt, bool reset)
-{
-    size_t pos = 0; // (initialize to keep compiler happy)
-    if (reset)
-        pos = GetPosition();
-    val.resize(cnt);
-    char *str = const_cast<char*>(val.c_str());
-    for (int i=0;i < cnt;++i)
-        *this >> str[i];
-    if (reset)
-        SetPosition(pos);
-}
-
-// ReadChars - read a specified number of characters, and reset read pointer if requested
-// val - [in,out] return value will be returned here
-// cnt - number of characters to read
-// reset - reset the read pointer
-void File::ReadChars(std::wstring& val, size_t cnt, bool reset)
-{
-    size_t pos = 0; // (initialize to keep compiler happy)
-    if (reset)
-        pos = GetPosition();
-    val.resize(cnt);
-    wchar_t *str = const_cast<wchar_t*>(val.c_str());
-    for (int i=0;i < cnt;++i)
-        *this >> str[i];
-    if (reset)
-        SetPosition(pos);
-}
-
-// WriteString - outputs a string into the file
-// str - the string to output
-// size - size of the string to output; if zero, null terminated
-void File::WriteString(const char* str, int size)
-{
-    attempt([&]{
-        if (size > 0)
-        {
-            fwprintf(m_file, L" %.*hs", size, str);
-        }
-        else
-        {
-            if (IsTextBased())
-                fwprintf(m_file, L" %hs", str);
-            else
-                fputstring (m_file, str);
-        }
-    });
-}
-
-// ReadString - reads a string from the file
-// str - the string buffer to read the string into
-// size - size of the string buffer
-void File::ReadString(char* str, int size)
-{
-    attempt([&]{
-        if (IsTextBased())
-            fgettoken(m_file, str, size);
-        else
-            fgetstring (m_file, str, size);
-    });
-}
-
-// WriteString - outputs a string into the file
-// if writing to a text-based file and spaces are embedded, writes quotes around the string
-// str - the string to output
-// size - size of the string to output; if zero, null terminated
-void File::WriteString(const wchar_t* str, int size)
-{
-    attempt([&]{
-#ifdef EMBEDDED_SPACES
-        // start of implementation of embedded space support with quoting
-        // not complete, not sure if we need it
-        bool spacefound = false;
-        wchar_t quote = 0;
-        if (IsTextBased())
-        {
-            // search for embedded spaces and quotes
-            wstring searchString = L" \"'~";
-            const wchar_t* result = NULL;
-            while (result = wcspbrk(str, searchString.c_str()))
-            {
-                if (IsWhiteSpace(*result))
-                    spacefound = true;
-                searchString.find(*result, 0);
-            }
-        }
-#endif
-        if (size > 0)
-        {
-            fwprintf(m_file, L" %.*ls", size, str);
-        }
-        else
-        {
-            if (IsTextBased())
-                fwprintf(m_file, L" %ls", str);
-            else
-                fputstring (m_file, str);
-        }
-    });
-}
-
-// ReadString - reads a string from the file
-// str - the string buffer to read the string into
-// size - size of the string buffer
-void File::ReadString(wchar_t* str, int size)
-{
-    attempt([&]{
-        if (IsTextBased())
-            fgettoken(m_file, str, size);
-        else
-            fgetstring (m_file, str, size);
-    });
-}
-
-// IsUnicodeBOM - are the next characters the Unicode Byte Order Mark?
-// skip - skip the BOM mark if found (defaults to false)
-// returns - true if on a unicode BOM
-bool File::IsUnicodeBOM(bool skip)
-{
-    File& file = *this;
-    uint64_t pos = GetPosition();
-    // if we aren't at the beginning of the file, it can't be the byte order mark
-    if (pos != 0)
-        return false;
-
-    // only exists for UNICODE files
-    bool found = false;
-    if (m_options & fileOptionsUnicode)
-    {
-        unsigned int bom=0;
-        if (IsTextBased())
-            ftrygetText(m_file, bom);
-        else
-            fget(m_file, bom);
-        // future: one reason for the BOM is to detect other-endian files, should we support?
-        found = (bom == 0xfeff);
-    }
-    else if (m_options & fileOptionsText)
-    {
-        char val[3];
-        file.ReadString(val, 3);
-        found = (val[0] == 0xEF && val[1] == 0xBB && val[2] == 0xBF);
-    }
-    // restore pointer if no BOM or we aren't skipping it
-    if (!found || !skip)
-    {
-        SetPosition(pos);
-    }
-    return found;
-}
-
-// Size - return the size of the file
-// WARNING: calling this will reset the EOF marker, so do so with care
-size_t File::Size()
-{
-    return filesize(m_file);
-}
-
-// IsEOF - if we have read past the end of the file
-// return - true if end of file has been found
-bool File::IsEOF()
-{
-    return !!feof(m_file);
-}
-
-// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
-// skip - skip the whitespace if found (defaults to false)
-// returns - true if whitespace found
-bool File::IsWhiteSpace(bool skip)
-{
-    bool spaceFound = false;
-    bool spaceCur = false;
-    if (m_options & fileOptionsUnicode)
-    {
-        wint_t c;
-        do
-        {
-            c = fgetwc (m_file);
-            if (c == WEOF) // hit the end
-                return spaceFound;
-            spaceCur = !!iswspace(c);
-            spaceFound = spaceFound || spaceCur;
-        } while (spaceCur && skip);
-        // put back the last character (WEOF is ignored)
-        ungetwc(c, m_file);
-    }
-    else
-    {
-        int c;
-        do
-        {
-            c = fgetc (m_file);
-            if (c == EOF) // hit the end
-                return spaceFound;
-            spaceCur = !!isspace(c);
-            spaceFound = spaceFound || spaceCur;
-        } while (spaceCur && skip);
-        // put back the last character (EOF is ignored)
-        ungetc(c, m_file);
-    }
-
-    return spaceFound;
-}
-
-// EndOfLineOrEOF - are the next characters an end-of-line sequence ('\r\n') possibly preceded by (space, \t)? EOF detected too
-// skip - skip the end of line if found (defaults to false)
-// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
-int File::EndOfLineOrEOF(bool skip)
-{
-    int found = false;
-    if (m_options & fileOptionsUnicode)
-        found = fskipwNewline(m_file,skip);
-    else if (m_options & fileOptionsText)
-        found = fskipNewline(m_file, skip);
-    return found;
-}
-
-
-// Get a marker from the file
-// some are ignored, others are expecting characters
-// must use GetMarker methods for those that require parameters
-File& File::operator>>(FileMarker marker)
-{
-    File& file = *this;
-
-    switch(marker)
-    {
-    case fileMarkerBeginFile: // beginning of file marker
-        // check for Unicode BOM marker
-        if (IsTextBased())
-            IsUnicodeBOM(true);
-        break;
-    case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
-        if (!IsEOF())
-            throw std::runtime_error("fileMarkerEndFile not found");
-        break;
-    case fileMarkerBeginList: // Beginning of list marker
-        // no marker written unless a list with a count header
-        break;
-    case fileMarkerListSeparator: // separate elements of a list
-        // do nothing for now, built-in space delimiter for all types (before type)
-        // future: make this customizable, so you can specify a separator (i.e. ',')
-        break;
-    case fileMarkerEndList: // end of line/list marker
-        if (IsTextBased())
-        {
-            int found = EndOfLineOrEOF(true);
-            if (found != (int)true) // EOF can also be returned
-                throw std::runtime_error("Newline not found");
-        }
-        break;
-    case fileMarkerBeginSection: // beginning of section
-    case fileMarkerEndSection: // end of section
-        assert(false); // sections should use a string modifier
-        break;
-    }
-    return file;
-}
-
-// Get a marker from the file
-// some are ignored, others are expecting characters
-// must use GetMarker methods for those that require parameters
-bool File::IsMarker(FileMarker marker, bool skip)
-{
-    bool retval = false;
-    switch(marker)
-    {
-    case fileMarkerBeginFile: // beginning of file marker
-        // check for Unicode BOM marker
-        retval = IsUnicodeBOM(skip);
-        break;
-    case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
-        retval = IsEOF();
-        break;
-    case fileMarkerBeginList: // Beginning of list marker
-        // no marker written unless a list with a count header
-        // should we try to validate BOL header (just know it's an int, not negative, etc.)
-        break;
-    case fileMarkerListSeparator: // separate elements of a list
-        // do nothing for now, built-in space delimiter for all types (before type)
-        // future: make this customizable, so you can specify a separator (i.e. ',')
-        break;
-    case fileMarkerEndList: // end of line/list marker
-        if (IsTextBased())
-        {
-            int eolSeen = false;
-            eolSeen = EndOfLineOrEOF(skip);
-            retval = (eolSeen == (int)true);
-        }
-        break;
-    case fileMarkerBeginSection: // beginning of section
-    case fileMarkerEndSection: // end of section
-        // can't distinguish from a string currently
-        break;
-    }
-    return retval;
-}
-
-
-// GetMarker for beginning of list support (lists with a count)
-// count - [out] returns the number of elements in the list
-File& File::GetMarker(FileMarker marker, size_t& count)
-{
-    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count file markers
-    // use text-based try, so it can fail without an exception
-    if (IsTextBased())
-        ftrygetText(m_file, count);
-    else
-        fget(m_file, count);
-    return *this;
-}
-
-// GetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-File& File::GetMarker(FileMarker marker, const std::string& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    string str;
-    *this >> str;
-    if (str != section)
-        throw std::runtime_error(std::string("section name mismatch ") + str + " != " + section);
-    return *this;
-}
-
-// GetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-File& File::GetMarker(FileMarker marker, const std::wstring& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    wstring str;
-    *this >> str;
-    if (str != section)
-        throw std::runtime_error(std::string("section name mismatch ") + msra::strfun::utf8(str) + " != " + msra::strfun::utf8(section));
-    return *this;
-}
-
-// TryGetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    size_t pos = GetPosition();
-    std::wstring str;
-    try
-    {
-        *this >> str;
-        if (str == section)
-            return true;
-    }
-    catch(...)
-    {
-        //eat
-    }
-    SetPosition(pos);
-    return false;
-}
-
-// TryGetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-bool File::TryGetMarker(FileMarker marker, const std::string& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    size_t pos = GetPosition();
-    std::string str;
-    try
-    {
-        *this >> str;
-        if (str == section)
-            return true;
-    }
-    catch(...)
-    {
-        return false;
-    }
-    SetPosition(pos);
-    return false;
-}
-
-// GetPosition - Get position in a file
-uint64_t File::GetPosition()
-{
-    return fgetpos(m_file);
-}
-
-// Set the position in the file
-// pos - position in the file
-void File::SetPosition(uint64_t pos)
-{
-    fsetpos (m_file, pos);
-}
-
-}}}
+// +// + +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings +#endif + +#include "basetypes.h" +#define FORMAT_SPECIALIZE // to get the specialized version of the format routines +#include "fileutil.h" +#include "File.h" +#include +#include +#include +#ifdef _WIN32 +#include +#endif +#ifdef __unix__ +#include +#endif + +namespace Microsoft{ namespace MSR { namespace CNTK { + +// File creation +// filename - the path +// fileOptions - options to open the file +File::File(const std::wstring& filename, int fileOptions) +{ + Init(filename.c_str(), fileOptions); +} + +File::File(const std::string& filename, int fileOptions) +{ + // this converts from string to wstring, and then to wchar_t* + Init(msra::strfun::utf16(filename).c_str(), fileOptions); +} + +File::File(const wchar_t* filename, int fileOptions) +{ + Init(filename, fileOptions); +} + +void File::Init(const wchar_t* filename, int fileOptions) +{ + msra::files::make_intermediate_dirs(filename); + // translate the options string into a string for fopen() + wstring options = fileOptions&fileOptionsRead?L"r":L""; + if (fileOptions&fileOptionsWrite) + { + // if we already are reading the file, change to read/write + options.clear(); + options.append(L"w+"); + } + if (fileOptions&fileOptionsBinary) + { + options += L"b"; + } + else + { + if (fileOptions & fileOptionsUnicode) + options += L"b"; + else + options += L"t"; + // I attempted to use the translated characterset modes, but encountered strange errors + //options += L"t, ccs="; + //options += (fileOptions & fileOptionsUnicode)?L"UNICODE":L"UTF-8"; + } + // add sequential flag to allocate big read buffer + if (fileOptions & fileOptionsSequential) + options += L"S"; + + attempt([=](){m_file = fopenOrDie(filename, options.c_str());}); + m_options = fileOptions; + m_size = filesize(m_file); +} + +void File::goToDelimiter(int delim) +{ + int ch=0; + + while (ch!=delim) { + ch=fgetc(m_file); + if (feof(m_file)) { + printf("Unexpected end of file\n"); + throw std::logic_error("Unexpected end of file\n"); + } + } +} + +bool File::IsTextBased() +{ + return !!(m_options & (fileOptionsText|fileOptionsUnicode)); +} + +// File Destructor +// closes the file +File::~File(void) +{ + attempt([=] {fcloseOrDie(m_file);}); +} + +// GetLine - get a line from the file +// str - string to store the line +void File::GetLine(wstring& str) +{ + str = fgetlinew(m_file); +} + +// GetLine - get a line from the file +// str - string +void File::GetLine(string& str) +{ + str = fgetline(m_file); +} + +// Put a zero/space terminated wstring into a file +// val - value to write to the file +File& File::operator<<(const std::wstring& val) +{ + WriteString(val.c_str()); + return *this; +} + + +// Put a zero/space terminated string into a file +// val - value to write to the file +File& File::operator<<(const std::string& val) +{ + WriteString(val.c_str()); + return *this; +} + +// Put a marker in the file, the marker depends on the file type +// marker - marker to place in the file +File& File::operator<<(FileMarker marker) +{ + File& file = *this; + switch(marker) + { + case fileMarkerBeginFile: // beginning of file marker + // only exists for UNICODE files + if (m_options & fileOptionsUnicode) + file << (unsigned int)0xfeff; // byte order mark + break; + case fileMarkerEndFile: // end of file marker + // use ^Z for end of file for text files + if (m_options & 
fileOptionsUnicode) + file << wchar_t(26); // ^Z + else if (m_options & fileOptionsText) + file << char(26); + break; + case fileMarkerBeginList: // Beginning of list marker + // no marker written for either + break; + case fileMarkerListSeparator: // separate elements of a list + // do nothing for now, built in space deliminter for all types (before type) + // future: make this customizable, so you can specify a separator (i.e. ',') + break; + case fileMarkerEndList: // end of line/list marker + if (m_options & fileOptionsUnicode) + file.WriteString(L"\r\n"); // carriage return/life feed + else if (m_options & fileOptionsText) + file.WriteString("\r\n"); + break; + case fileMarkerBeginSection: // beginning of section + case fileMarkerEndSection: // end of section + assert(false); // sections should use a string modifier + break; + } + return file; +} + +// PutMarker for beginning of list support (lists with a count) +// count - [in] the number of elements in the list +File& File::PutMarker(FileMarker marker, size_t count) +{ + assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count markers + *this << count; + return *this; +} + +// PutMarker for section beginning and ending tags +// section - [in]name of section +File& File::PutMarker(FileMarker marker, const std::string& section) +{ + File& file = *this; + // only the section markers take a string parameter + assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker; + file << section; + return file; +} + +// PutMarker for section beginning and ending tags +// section - [in]name of section +File& File::PutMarker(FileMarker marker, const std::wstring& section) +{ + File& file = *this; + // only the section markers take a string parameter + assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker; + file << section; + return file; +} + +// Get a zero terminated wstring from a file +// val - value to read from the file +File& File::operator>>(std::wstring& val) +{ + attempt([&]{ + if (IsTextBased()) + val = fgetwtoken(m_file); + else + val = fgetwstring(m_file); + }); + return *this; +} + +// Get a zero terminated string from a file +// val - value to read from the file +File& File::operator>>(std::string& val) +{ + attempt([&]{ + if (IsTextBased()) + val = fgettoken(m_file); + else + val = fgetstring(m_file); + }); + return *this; +} + +// ReadChars - read a specified number of characters, and reset read pointer if requested +// val - [in,out] return value will be returned here +// cnt - number of characters to read +// reset - reset the read pointer +void File::ReadChars(std::string& val, size_t cnt, bool reset) +{ + size_t pos = 0; // (initialize to keep compiler happy) + if (reset) + pos = GetPosition(); + val.resize(cnt); + char *str = const_cast(val.c_str()); + for (int i=0;i < cnt;++i) + *this >> str[i]; + if (reset) + SetPosition(pos); +} + +// ReadChars - read a specified number of characters, and reset read pointer if requested +// val - [in,out] return value will be returned here +// cnt - number of characters to read +// reset - reset the read pointer +void File::ReadChars(std::wstring& val, size_t cnt, bool reset) +{ + size_t pos = 0; // (initialize to keep compiler happy) + if (reset) + pos = GetPosition(); + val.resize(cnt); + wchar_t *str = const_cast(val.c_str()); + for (int i=0;i < cnt;++i) + *this >> str[i]; + if (reset) + SetPosition(pos); +} + +// WriteString - outputs a string into the file +// str - the string to output +// size - 
size of the string to output, if zero null terminated +void File::WriteString(const char* str, int size) +{ + attempt([&]{ + if (size > 0) + { + fwprintf(m_file, L" %.*hs", size, str); + } + else + { + if (IsTextBased()) + fwprintf(m_file, L" %hs", str); + else + fputstring (m_file, str); + } + }); +} + +// ReadString - reads a string into the file +// str - the string buffer to read the string into +// size - size of the string string buffer +void File::ReadString(char* str, int size) +{ + attempt([&]{ + if (IsTextBased()) + fgettoken(m_file, str, size); + else + fgetstring (m_file, str, size); + }); +} + +// WriteString - outputs a string into the file +// if writing to text based file and spaces are embedded, writes quotes around string +// str - the string to output +// size - size of the string to output, if zero null terminated +void File::WriteString(const wchar_t* str, int size) +{ + attempt([&]{ +#ifdef EMBEDDED_SPACES + // start of implementation of embedded space support with quoting + // not complete, not sure if we need it + bool spacefound = false; + wchar_t quote = 0; + if (IsTextBased()) + { + // search for embedded spaces and quotes + wstring searchString = L" \"'~"; + const wchar_t* result = NULL; + while (result = wcspbrk(str, searchString.c_str())) + { + if (IsWhiteSpace(*result)) + spacefound = true; + searchString.find(*result, 0); + } + } +#endif + if (size > 0) + { + fwprintf(m_file, L" %.*ls", size, str); + } + else + { + if (IsTextBased()) + fwprintf(m_file, L" %ls", str); + else + fputstring (m_file, str); + } + }); +} + +// ReadString - reads a string into the file +// str - the string buffer to read the string into +// size - size of the string string buffer +void File::ReadString(wchar_t* str, int size) +{ + attempt([&]{ + if (IsTextBased()) + fgettoken(m_file, str, size); + else + fgetstring (m_file, str, size); + }); +} + +// IsUnicodeBOM - is the next characters the Unicode Byte Order Mark? +// skip - skip the BOM mark if found (defaults to false) +// returns - true if on a unicode BOM +bool File::IsUnicodeBOM(bool skip) +{ + File& file = *this; + uint64_t pos = GetPosition(); + // if we aren't at the beginning of the file, it can't be the byte order mark + if (pos != 0) + return false; + + // only exists for UNICODE files + bool found = false; + if (m_options & fileOptionsUnicode) + { + unsigned int bom=0; + if (IsTextBased()) + ftrygetText(m_file, bom); + else + fget(m_file, bom); + // future: one reason for the BOM is to detect other-endian files, should we support? + found = (bom == 0xfeff); + } + else if (m_options & fileOptionsText) + { + char val[3]; + file.ReadString(val, 3); + found = (val[0] == 0xEF && val[1] == 0xBB && val[2] == 0xBF); + } + // restore pointer if no BOM or we aren't skipping it + if (!found || !skip) + { + SetPosition(pos); + } + return found; +} + +//Size - return the size of the file +// WARNING: calling this will reset the EOF marker, so do so with care +size_t File::Size() +{ + return filesize(m_file); +} + +// IsEOF - if we have read past the end of the file +// return - true if end of file has been found +bool File::IsEOF() +{ + return !!feof(m_file); +} + +// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)? 
+// skip - skip the whitespace if found (defaults to false) +// returns - true if whitespace found +bool File::IsWhiteSpace(bool skip) +{ + bool spaceFound = false; + bool spaceCur = false; + if (m_options & fileOptionsUnicode) + { + wint_t c; + do + { + c = fgetwc (m_file); + if (c == WEOF) // hit the end + return spaceFound; + spaceCur = !!iswspace(c); + spaceFound = spaceFound || spaceCur; + } while (spaceCur && skip); + // put back the last character (WEOF is ignored) + ungetwc(c, m_file); + } + else + { + int c; + do + { + c = fgetc (m_file); + if (c == EOF) // hit the end + return spaceFound; + spaceCur = !!isspace(c); + spaceFound = spaceFound || spaceCur; + } while (spaceCur && skip); + // put back the last character (EOF is ignored) + ungetc(c, m_file); + } + + return spaceFound; +} + +// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceeded by (space, \t)? EOF detected too +// skip - skip the end of line if found (defaults to false) +// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped +int File::EndOfLineOrEOF(bool skip) +{ + int found = false; + if (m_options & fileOptionsUnicode) + found = fskipwNewline(m_file,skip); + else if (m_options & fileOptionsText) + found = fskipNewline(m_file, skip); + return found; +} + + +// Get a marker from the file +// some are ignored others are expecting characters +// must use GetMarker methods for those that require parameters +File& File::operator>>(FileMarker marker) +{ + File& file = *this; + + switch(marker) + { + case fileMarkerBeginFile: // beginning of file marker + // check for Unicode BOM marker + if (IsTextBased()) + IsUnicodeBOM(true); + break; + case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file? + if (!IsEOF()) + throw std::runtime_error("fileMarkerEndFile not found"); + break; + case fileMarkerBeginList: // Beginning of list marker + // no marker written unless an list with a count header + break; + case fileMarkerListSeparator: // separate elements of a list + // do nothing for now, built in space deliminter for all types (before type) + // future: make this customizable, so you can specify a separator (i.e. ',') + break; + case fileMarkerEndList: // end of line/list marker + if (IsTextBased()) + { + int found = EndOfLineOrEOF(true); + if (found != (int)true) // EOF can also be returned + throw std::runtime_error("Newline not found"); + } + break; + case fileMarkerBeginSection: // beginning of section + case fileMarkerEndSection: // end of section + assert(false); // sections should use a string modifier + break; + } + return file; +} + +// Get a marker from the file +// some are ignored others are expecting characters +// must use GetMarker methods for those that require parameters +bool File::IsMarker(FileMarker marker, bool skip) +{ + bool retval = false; + switch(marker) + { + case fileMarkerBeginFile: // beginning of file marker + // check for Unicode BOM marker + retval = IsUnicodeBOM(skip); + break; + case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file? + retval = IsEOF(); + break; + case fileMarkerBeginList: // Beginning of list marker + // no marker written unless an list with a count header + // should we try to validate BOL header (just know it's an int, not negative, etc.) 
+        break;
+    case fileMarkerListSeparator: // separate elements of a list
+        // do nothing for now, built-in space delimiter for all types (before type)
+        // future: make this customizable, so you can specify a separator (i.e. ',')
+        break;
+    case fileMarkerEndList: // end of line/list marker
+        if (IsTextBased())
+        {
+            int eolSeen = false;
+            eolSeen = EndOfLineOrEOF(skip);
+            retval = (eolSeen == (int)true);
+        }
+        break;
+    case fileMarkerBeginSection: // beginning of section
+    case fileMarkerEndSection: // end of section
+        // can't distinguish from a string currently
+        break;
+    }
+    return retval;
+}
+
+
+// GetMarker for beginning-of-list support (lists with a count)
+// count - [out] returns the number of elements in the list
+File& File::GetMarker(FileMarker marker, size_t& count)
+{
+    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count file markers
+    // use the text-based try, so it can fail without an exception
+    if (IsTextBased())
+        ftrygetText(m_file, count);
+    else
+        fget(m_file, count);
+    return *this;
+}
+
+// GetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+File& File::GetMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    string str;
+    *this >> str;
+    if (str != section)
+        throw std::runtime_error(std::string("section name mismatch ") + str + " != " + section);
+    return *this;
+}
+
+// GetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+File& File::GetMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    wstring str;
+    *this >> str;
+    if (str != section)
+        throw std::runtime_error(std::string("section name mismatch ") + msra::strfun::utf8(str) + " != " + msra::strfun::utf8(section));
+    return *this;
+}
+
+// TryGetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    size_t pos = GetPosition();
+    std::wstring str;
+    try
+    {
+        *this >> str;
+        if (str == section)
+            return true;
+    }
+    catch(...)
+    {
+        // eat the exception and fall through to restore the position
+    }
+    SetPosition(pos);
+    return false;
+}
+
+// TryGetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+bool File::TryGetMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    size_t pos = GetPosition();
+    std::string str;
+    try
+    {
+        *this >> str;
+        if (str == section)
+            return true;
+    }
+    catch(...)
+    {
+        // eat the exception and fall through, so the position is restored as in the wstring overload
+    }
+    SetPosition(pos);
+    return false;
+}
+
+// GetPosition - get the position in the file
+uint64_t File::GetPosition()
+{
+    return fgetpos(m_file);
+}
+
+// SetPosition - set the position in the file
+// pos - position in the file
+void File::SetPosition(uint64_t pos)
+{
+    fsetpos (m_file, pos);
+}
+
+}}}
diff --git a/Common/Include/TimerUtility.h b/Common/Include/TimerUtility.h
new file mode 100644
index 000000000..c964f4282
--- /dev/null
+++ b/Common/Include/TimerUtility.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#define MS_PER_SEC 1000
+
+namespace Microsoft{namespace MSR {namespace CNTK {
+    class Timer
+    {
+    public:
+        Timer(){};
+        ~Timer(){};
+        static unsigned long long MilliSecondElapsed();
+    };
+}}}
diff --git a/Common/TimerUtility.cpp b/Common/TimerUtility.cpp
new file mode 100644
index 000000000..f0fe29342
--- /dev/null
+++ b/Common/TimerUtility.cpp
@@ -0,0 +1,39 @@
+#include "TimerUtility.h"
+
+#ifdef WIN32
+#include <Windows.h>
+#else
+#include <time.h>
+#endif
+namespace Microsoft{
+    namespace MSR {
+        namespace CNTK {
+
+            // Returns the number of milliseconds elapsed since the Unix epoch
+            unsigned long long Timer::MilliSecondElapsed()
+            {
+#ifdef WIN32
+                FILETIME ft;
+                LARGE_INTEGER li;
+
+                GetSystemTimeAsFileTime(&ft); // ideally we should use GetSystemTimePreciseAsFileTime, but it's only available on Win8+ and Win Server 2012+
+                li.LowPart = ft.dwLowDateTime;
+                li.HighPart = ft.dwHighDateTime;
+
+                unsigned long long ret = li.QuadPart;
+                ret -= 116444736000000000LL; // shift the epoch from 1601-01-01 (FILETIME) to 1970-01-01, to make the values consistent with Linux
+                ret /= 10000;                // from 100-nanosecond ticks (10^-7 s) to 1 millisecond (10^-3 s)
+
+                return ret;
+#else
+                timespec ts;
+                clock_gettime(CLOCK_REALTIME, &ts); // works on Linux
+
+                unsigned long long ret = ts.tv_sec * 1000ULL + ts.tv_nsec / 1000000; // use a portable type here; UINT64 is Windows-only
+
+                return ret;
+#endif
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Common/fileutil.cpp b/Common/fileutil.cpp
index 4b2e3c565..9f6b6b134 100644
--- a/Common/fileutil.cpp
+++ b/Common/fileutil.cpp
@@ -4,7 +4,10 @@
 //
 //
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
 #define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
 #pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
 #define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux
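The 116444736000000000LL constant in Timer::MilliSecondElapsed above is the distance between the FILETIME epoch (1601-01-01) and the Unix epoch (1970-01-01), expressed in 100-nanosecond ticks. A standalone sanity check of that constant (my own sketch, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // 1601-01-01 .. 1970-01-01 spans 369 years containing 89 leap days
        // (92 multiples of 4, minus the non-leap centuries 1700, 1800, 1900).
        const uint64_t days    = 369ULL * 365 + 89;     // 134774
        const uint64_t seconds = days * 24 * 60 * 60;   // 11644473600
        const uint64_t ticks   = seconds * 10000000ULL; // 100 ns FILETIME ticks
        printf("%llu\n", (unsigned long long) ticks);   // 116444736000000000
        return ticks == 116444736000000000ULL ? 0 : 1;
    }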
diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp
index ebb659525..2269d8779 100644
--- a/DataReader/HTKMLFReader/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp
@@ -49,17 +49,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_truncated = readerConfig("Truncated", "false");
         m_convertLabelsToTargets = false;
-        m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
+        ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
+        m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
 
-        if (m_numberOfuttsPerMinibatch < 1)
+        for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
         {
-            LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+            m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
+            if (m_numberOfuttsPerMinibatch < 1)
+            {
+                LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+            }
+
+            if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
+            {
+                LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
+            }
         }
-        if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
-        {
-            LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
-        }
+        m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
 
         m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
         m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // get the read method, defaults to "blockRandomize" other option is "rollingWindow"
         std::string readMethod(readerConfig("readMethod","blockRandomize"));
 
+        if (readMethod == "blockRandomize" && randomize == randomizeNone)
+        {
+            fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Changing it to Auto.\n");
+            randomize = randomizeAuto;
+        }
+
         // see if they want to use readAhead
         m_readAhead = readerConfig("readAhead", "false");
@@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // now get the frame source. This has better randomization and doesn't create temp files
         m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
+        m_frameSource->setverbosity(verbosity);
         //m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
 
 }
@@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         m_mbSize = mbSize;
 
+        m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
+
+        m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
+        m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
+        m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
+        m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
+        m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
+
         if (m_trainOrTest)
         {
             StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);
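The hunks above turn nbruttsineachrecurrentiter into an array with one entry per epoch, validate every entry up front, and then have StartMinibatchLoop (last hunk) index it with the epoch number. A minimal sketch of that per-epoch lookup, assuming the argvector-style convention that a schedule shorter than the number of epochs keeps repeating its last entry; the helper name and the explicit clamp are mine, not from the patch:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for m_numberOfuttsPerMinibatchForAllEpochs[epoch]:
    // once the schedule runs out, the last entry keeps applying.
    static size_t UttsForEpoch(const std::vector<size_t>& perEpoch, size_t epoch)
    {
        return perEpoch[std::min(epoch, perEpoch.size() - 1)];
    }

    int main()
    {
        const std::vector<size_t> schedule = { 1, 2, 4 }; // e.g. nbruttsineachrecurrentiter=1:2:4
        for (size_t epoch = 0; epoch < 5; epoch++)
            printf("epoch %zu -> %zu utterances\n", epoch, UttsForEpoch(schedule, epoch));
        return 0; // prints 1, 2, 4, 4, 4
    }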
diff --git a/DataReader/HTKMLFReader/HTKMLFReader.h b/DataReader/HTKMLFReader/HTKMLFReader.h
index 3b7692f4b..a4e90da3d 100644
--- a/DataReader/HTKMLFReader/HTKMLFReader.h
+++ b/DataReader/HTKMLFReader/HTKMLFReader.h
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 //
 //
 // Copyright (c) Microsoft Corporation. All rights reserved.
@@ -111,4 +112,117 @@ public:
     void SetSentenceEnd(int /*actualMbSize*/){};
 };
+=======
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
+#pragma once
+#include "DataReader.h"
+#include "commandArgUtil.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template<class ElemType>
+class HTKMLFReader : public IDataReader<ElemType>
+{
+private:
+    msra::dbn::minibatchiterator* m_mbiter;
+    msra::dbn::minibatchsource* m_frameSource;
+    msra::dbn::minibatchreadaheadsource* m_readAheadSource;
+    msra::dbn::FileEvalSource* m_fileEvalSource;
+    msra::dbn::latticesource* m_lattices;
+    map<wstring, msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
+
+    vector<bool> m_sentenceEnd;
+    bool m_readAhead;
+    bool m_truncated;
+    vector<size_t> m_processedFrame;
+    intargvector m_numberOfuttsPerMinibatchForAllEpochs;
+    size_t m_numberOfuttsPerMinibatch;
+    size_t m_actualnumberOfuttsPerMinibatch;
+    size_t m_mbSize;
+    vector<size_t> m_toProcess;
+    vector<size_t> m_switchFrame;
+    bool m_noData;
+
+    bool m_trainOrTest; // if false, in file writing mode
+
+    std::map<LabelIdType, LabelType> m_idToLabelMap;
+
+    bool m_partialMinibatch; // allow partial minibatches?
+
+    std::vector<ElemType*> m_featuresBufferMultiUtt;
+    std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
+    std::vector<ElemType*> m_labelsBufferMultiUtt;
+    std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
+    std::vector<size_t> m_featuresStartIndexMultiUtt;
+    std::vector<size_t> m_labelsStartIndexMultiUtt;
+
+    std::vector<ElemType*> m_featuresBufferMultiIO;
+    std::vector<size_t> m_featuresBufferAllocatedMultiIO;
+    std::vector<ElemType*> m_labelsBufferMultiIO;
+    std::vector<size_t> m_labelsBufferAllocatedMultiIO;
+
+    std::map<std::wstring, size_t> m_featureNameToIdMap;
+    std::map<std::wstring, size_t> m_labelNameToIdMap;
+    std::map<std::wstring, size_t> m_nameToTypeMap;
+    std::map<std::wstring, size_t> m_featureNameToDimMap;
+    std::map<std::wstring, size_t> m_labelNameToDimMap;
+    // for writing outputs to files (standard single input/output network) - deprecate eventually
+    bool m_checkDictionaryKeys;
+    bool m_convertLabelsToTargets;
+    std::vector<bool> m_convertLabelsToTargetsMultiIO;
+    std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
+
+    size_t m_inputFileIndex;
+    std::vector<size_t> m_featDims;
+    std::vector<size_t> m_labelDims;
+
+    std::vector<std::vector<std::vector<ElemType>>> m_labelToTargetMapMultiIO;
+
+    void PrepareForTrainingOrTesting(const ConfigParameters& config);
+    void PrepareForWriting(const ConfigParameters& config);
+
+    bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+    bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+
+    void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+    void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+
+    bool ReNewBufferForMultiIO(size_t i);
+
+    size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch; }
+    void SetNbrSlicesEachRecurrentIter(const size_t) { };
+
+    void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
+
+
+    size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
+    enum InputOutputTypes
+    {
+        real,
+        category,
+    };
+
+
+
+public:
+    virtual void Init(const ConfigParameters& config);
+    virtual void Destroy() {delete this;}
+    virtual ~HTKMLFReader();
+    virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+    virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+    virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
+    virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
+    virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
+
+    virtual bool 
DataEnd(EndDataType endDataType); + void SetSentenceEndInBatch(vector &/*sentenceEnd*/); + void SetSentenceEnd(int /*actualMbSize*/){}; +}; + +>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d }}} \ No newline at end of file diff --git a/DataReader/HTKMLFReader/rollingwindowsource.h b/DataReader/HTKMLFReader/rollingwindowsource.h index a3babcb13..7d5e253cc 100644 --- a/DataReader/HTKMLFReader/rollingwindowsource.h +++ b/DataReader/HTKMLFReader/rollingwindowsource.h @@ -1,817 +1,817 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// rollingwindowsource.h -- implementation of a rolling-window minibatch source ('minibatchframesource') with a disk page file -// - -#pragma once - -#include "basetypes.h" // for attempt() -#include "numahelpers.h" // for NUMA allocation -#include "minibatchsourcehelpers.h" -#include "minibatchiterator.h" -#include "biggrowablevectors.h" -#include "ssematrix.h" - -namespace msra { namespace dbn { - - // --------------------------------------------------------------------------- - // biggrowablevectorarray -- a big array of vectors for features, growable (push_back) - // Data is striped across NUMA nodes, as to not clog them up. - // This also supports paging to disk, which is used for the old minibatchframesource. - // --------------------------------------------------------------------------- - class biggrowablevectorarray : public growablevectorbase - { - size_t m; // dim - - size_t inmembegin; // range we have in memory, rounded to enclosing blocks (not rounded at end) - size_t inmemend; - - wstring pagepath; // path for paging, empty if no paging - auto_file_ptr f; // file handle for paging - bool reading; // have we begun reading? - - // allocate a block - msra::dbn::matrix * newblock() const - { - // we stripe the data across NUMA nodes as to not fill up one node with the feature data - msra::numa::overridenode ((int) msra::numa::getmostspaciousnumanode()); - msra::dbn::matrix * res = new msra::dbn::matrix (m, elementsperblock); - msra::numa::overridenode (-1); // note: we really should reset it also in case of failure - return res; - } - - // handling of page file - bool paging() const { return !pagepath.empty(); } - void openpagefile (bool wantread) - { - if (!paging()) return; - msra::files::make_intermediate_dirs (pagepath); - - if (!wantread) - { - FILE *ftry = NULL; - wstring pathname (pagepath); - ftry = _wfopen (pathname.c_str(), L"wbS"); - if (ftry) fclose (ftry); - } - - /* - code below to cycle through a-z appended to file name is no longer necessary - since caller guarantees unique file names via HTKMLFReader - and we want the pagepath logged to the user to be the actual one used by the code - - // try to open the pagepath from a to z - if (!wantread) - { - FILE *ftry = NULL; - char trynum = 'a'; - while (!ftry && trynum <= 'z') - { - wstring pathname (pagepath); - pathname += trynum++; - ftry = _wfopen (pathname.c_str(), L"wbS"); - } - if (ftry) fclose (ftry); - pagepath += --trynum; - } - */ - f = fopenOrDie (pagepath, wantread ? 
L"rbS" : L"wbS"); - reading = wantread; - } - void flushlastblock() // during population phase, must be called once per block in sequence - { - if (!paging()) return; - assert (!reading); - if (blocks.empty()) return; - const size_t blockid = blocks.size() -1; - msra::dbn::matrix & block = *blocks[blockid]; - assert (fgetpos (f) == blockid * block.sizeinpagefile()); - block.topagefile (f); - blocks[blockid].reset(); // free the memory - assert (blockid * elementsperblock == inmembegin); - inmembegin = inmemend; // empty range - } - void releaseblock (size_t t0) // t0=block start time - { - assert (paging() && reading); - size_t blockid = t0 / elementsperblock; - assert (blockid * elementsperblock == t0); - assert (blocks[blockid]); - fprintf (stderr, "recoverblock: releasing feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); - blocks[blockid].reset(); // free the memory - } - void recoverblock (size_t t0) // t0=block start time - { - assert (paging() && reading); - size_t blockid = t0 / elementsperblock; - assert (blockid * elementsperblock == t0); - assert (!blocks[blockid]); - fprintf (stderr, "recoverblock: recovering feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); - blocks[blockid].reset (newblock()); - msra::dbn::matrix & block = *blocks[blockid]; - fsetpos (f, blockid * block.sizeinpagefile()); - block.frompagefile (f); - } - - public: - biggrowablevectorarray (const wstring & pagepath) - : growablevectorbase (65536), m (0), - inmembegin (0), inmemend (0), pagepath (pagepath), reading (false) - { - openpagefile (false); - if (paging()) - fprintf (stderr, "biggrowablevectorarray: creating disk backup store at '%S'\n", pagepath.c_str()); - } - ~biggrowablevectorarray() { // clean up the big temp file - if (paging()) { - fclose (f); - if (_wunlink (pagepath.c_str())==0) - fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); - else - fprintf (stderr, "biggrowablevectorarray: unable to delete disk backup store at '%S'\n", pagepath.c_str()); - } - } - - size_t dim() const { return m; } // dimension of a frame - - // reading phase - void push_back (const std::vector & in) - { - assert (!in.empty()); - assert (m == 0 || m == in.size()); - m = in.size(); - const size_t blockid = n / elementsperblock; - assert (blockid <= blocks.size()); - if (blockid == blocks.size()) // a new block is needed - { - flushlastblock(); - blocks.push_back (std::unique_ptr (newblock())); - } - const size_t blockn = n % elementsperblock; - msra::dbn::matrix & block = *blocks[blockid].get(); - foreach_index (k, in) - block(k,blockn) = in[k]; - n++; - inmemend = n; - } - void no_more_push_back() // done pushing --switch to consumption mode - { - if (!paging()) return; - // finish off last block - flushlastblock(); - fflushOrDie (f); - fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f)); - fclose (f); - foreach_index (i, blocks) assert (!blocks[i]); // ensure we flushed - assert (inmembegin == inmemend); // nothing in cache - // switch to reading mode - openpagefile (true); - } - - // access phase - // Returns 'true' if data was actually read from disk. 
- bool require (pair bounds) // we require this range of frames - { - bool readfromdisk = false; - - // get bounds rounded to block boundaries - const size_t ts = bounds.first / elementsperblock * elementsperblock; - const size_t te = min (n, (bounds.second + elementsperblock -1) / elementsperblock * elementsperblock); - assert (paging()); - // free all the memmory - for (size_t t = inmembegin; t < inmemend; t += elementsperblock) - { - if (t >= ts && t < te) // if in wanted range then skip to end of it - t = te - elementsperblock; - else - releaseblock (t); - } - // page in all required blocks - for (size_t t = ts; t < te; t += elementsperblock) - { - if (t >= inmembegin && t < inmemend) // if in memory already then skip to end of it - t = inmemend - elementsperblock; - else - { - recoverblock (t); - readfromdisk = true; // tell caller we did something expensive - } - } - // got it - inmembegin = ts; - inmemend = te; - return readfromdisk; - } - const msra::dbn::matrixstripe operator[] (size_t t) const // get a feature vector - { - if (t < inmembegin || t >= inmemend) - throw std::logic_error ("biggrowablevectorarray: attempt to access vector without requesting to page it in first"); - const size_t blockt = getblockt (t); - /*const*/ msra::dbn::matrix & block = getblock (t); - return msra::dbn::matrixstripe (block, blockt, 1); - } - wstring pagepathname(){ return pagepath;} - void cleanuppagefile() - { - if (paging()) { - fclose (f); - if (_wunlink (pagepath.c_str())==0){ - fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); - } - else{ - fprintf (stderr, "biggrowablevectorarray: could NOT delete disk backup store at '%S'\n", pagepath.c_str()); - } - } - } - }; - - // --------------------------------------------------------------------------- - // minibatchframesource -- feature source to provide randomized frames in minibatches - // This is the old code that pages all frames to a huge disk file first. - // (The new minibatchutterancesource pages from input files directly and can also - // operate in utterance mode for MMI training.) - // --------------------------------------------------------------------------- - class minibatchframesource : public minibatchsource - { - size_t vdim; // feature dimension after augmenting neighhors (0: don't read features) - unsigned int sampperiod; // (for reference and to check against model) - string featkind; - size_t featdim; - // cache - biggrowablevectorarray frames; // [t][i] all features concatenated - std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors()) - std::vector classids; // [t] the state that the frame belongs to - size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels - msra::dbn::randomordering randomordering; // [t] -> t' - double timegetbatch; - int verbosity; - public: - // constructor - // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). 
- minibatchframesource (const std::vector & infiles, const map> & labels, - size_t vdim, size_t udim, size_t randomizationrange, const wstring & pagepath, const bool mayhavenoframe=false, int addEnergy=0) - : vdim (vdim), sampperiod (0), featdim (0), numframes (0), frames (pagepath), timegetbatch (0), verbosity(2) - { - if (vdim == 0 && labels.empty()) - throw runtime_error ("minibatchframesource: when running without features, labels are needed"); - // at this stage, we simply page in the entire training set at once and work off RAM - // We will benefit from feature archives indirectly through htkfeatio. - // TODO: - // - infiles must specify time range - // - at this stage only reserve() (we know the time range; allocate second-layer structure) - // - implement block-wise paging directly from HTK feature files through htkfeatreader - featkind.clear(); - std::vector frame; - fprintf (stderr, "minibatchframesource: reading %d utterances..", infiles.size()); - size_t numclasses = 0; // number of units found (actually max id +1) - size_t notfound = 0; // number of entries missing in MLF - msra::asr::htkfeatreader reader; // feature reader - reader.AddEnergy(addEnergy); - - foreach_index (i, infiles) - { - if (i % (infiles.size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } - msra::basetypes::matrix feat; - msra::asr::htkfeatreader::parsedpath ppath (infiles[i]); - - // skip files for which labels don't exist (assuming bad alignment) - wstring key; - if (!labels.empty()) // empty means unsupervised mode (don't load any) - { - key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) - if (labels.find (key) == labels.end()) - { - if (notfound < 5) - fprintf (stderr, "\nminibatchframesource: %d-th file not found in MLF label set: %S", i, key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - - // get feature frames - if (vdim != 0) // (vdim == special mode to not read features at all) - { - msra::util::attempt (5, [&]() - { - reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors - }); - if (featdim == 0) // first time - featdim = feat.rows(); - else if (featdim != feat.rows()) - throw std::runtime_error ("minibatchframesource: inconsistent feature dimension across files"); - // HVite occasionally generates mismatching output --skip such files - if (!key.empty()) // (we have a key if supervised mode) - { - const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) - size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); - if (abs ((int) labframes - (int) feat.cols()) > 0) - { - fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - // append to cache - frame.resize (featdim); - if (feat.cols() < 2) // (2 frames needed for boundary markers) - throw std::runtime_error ("minibatchframesource: utterances < 2 frames not supported"); - foreach_column (t, feat) - { - foreach_index (k, frame) - frame[k] = feat(k,t); - frames.push_back (frame); - numframes++; - boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? 
+1 : 0); - } - assert (numframes == frames.size()); - assert (numframes == boundaryflags.size()); - } - - // get label sequence - if (!key.empty()) // (we have a key if supervised mode) - { - const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) - foreach_index (i, labseq) - { - const auto & e = labseq[i]; - if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: labels not in consecutive order MLF in label set: %S", key.c_str())); - for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) - { - if (e.classid >= udim) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: class id exceeds model dimension in file %S", key.c_str())); - if (e.classid != (CLASSIDTYPE) e.classid) - throw std::runtime_error ("CLASSIDTYPE has too few bits"); - classids.push_back ((CLASSIDTYPE) e.classid); - numclasses = max (numclasses, 1u + e.classid); - } - } - if (vdim == 0) - numframes = classids.size(); - if (numframes != classids.size()) // TODO: remove this once we are confident - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); - assert (numframes == classids.size()); - } - else - { - assert (classids.empty()); // that's how we detect it later - } - } - assert (vdim == 0 || numframes == frames.size()); - assert (labels.empty() || numframes == classids.size()); - if ((vdim != 0 && numframes != frames.size()) || (!labels.empty() && numframes != classids.size())) - throw std::runtime_error ("minibatchframesource: numframes variable screwup"); - fprintf (stderr, " %d frames read from %d utterances; %d classes\n", numframes, infiles.size(), numclasses); - if (notfound > 0) - { - fprintf (stderr, "minibatchframesource: %d files out of %d not found in label set\n", notfound, infiles.size()); - if (notfound > infiles.size() / 2) - throw std::runtime_error ("minibatchframesource: too many files not found in label set--assuming broken configuration\n"); - } - - if (numframes == 0 && !mayhavenoframe) - throw std::runtime_error ("minibatchframesource: no input features given!"); - - // notify frames source to switch from population to consumption mode - frames.no_more_push_back(); - - // initialize randomizer - if (numframes > 0) - randomordering.resize (numframes, randomizationrange); - } - virtual ~minibatchframesource() {} - size_t totalframes() const { assert (vdim == 0 || numframes == frames.size()); assert (!issupervised() || numframes == classids.size()); return numframes; } - - bool issupervised() const { return !classids.empty(); } - - void setverbosity(int newverbosity) { verbosity = newverbosity; } - - // retrieve one minibatch - // Minibatches are deterministic pseudo-random samples. The entire corpus - // is repeated infinitely, but each repetition (a 'sweep') is randomized - // differently. - // This function allows to retrieve a mini-batch starting from any frame - // within this infinitely extended repetition. To the end, mini-batches are - // specified by start frame and #frames. - // This function returns the same data independent on #frames, i.e. the concept - // of the mini-batch is not defined in here, but on the caller side. The caller - // can retrieve the frames of a mini-batch in chunks that do not match the - // caller's definition of "mini-batch," e.g. bigger or smaller chunks. 
- // If a requested mini-batch spans a sweep boundary, then this function will - // not return samples after the sweep boundary. Instead, the returned frame - // set is shortened to not exceed the end of the sweep. The caller must make - // a separate second call to get the rest. In trainlayer(), the one - // sweep-boundary-spanning mini-batch will simply be shortened. - // This function is NOT thread-safe (due to caching of random sequence). - bool getbatch (const size_t globalts, const size_t framesrequested, msra::dbn::matrix & feat, std::vector & uids, - std::vector> & transcripts, - std::vector> & latticepairs) - { - auto_timer timergetbatch; - - transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) - latticepairs.clear(); // neither are lattices - - assert (totalframes() > 0); - const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) - const size_t ts = globalts % totalframes(); // start frame within the sweep - const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary - assert (te > ts); - if (verbosity >= 2) - fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); - - // get random sequence (each time index occurs exactly once) - // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. - const auto & tmap = randomordering (sweep); - - // page in the needed range of frames - const size_t extent = augmentationextent (frames.dim(), vdim); - bool readfromdisk = frames.require (randomordering.bounds (max (ts, extent) - extent, te + 1 + extent)); - - // generate features and uids - feat.resize (vdim, te - ts); // note: special mode vdim == 0 means no features to be loaded - if (issupervised()) // empty means unsupervised training -> return empty uids - uids.resize (te - ts); - else - uids.clear(); - for (size_t t = ts; t < te; t++) - { - size_t trand = tmap[t]; // the random-sequence sample point for this point in time - if (vdim != 0) - { - auto v_t = feat.col(t-ts); // the vector to fill in - augmentneighbors (frames, boundaryflags, trand, v_t); - } - if (issupervised()) - uids[t-ts] = classids[trand]; - } - timegetbatch = timergetbatch; - return readfromdisk; - } - - bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, - std::vector> & transcripts, - std::vector> & latticepairs) - { - // for single input/output set size to be 1 and run old getbatch - feat.resize(1); - uids.resize(1); - //transcripts.resize(1); - //latticepairs.resize(1); - return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs); - } - - double gettimegetbatch () { return timegetbatch;} - - // return first valid globalts to ask getbatch() for - // In frame mode, there is no constraint, i.e. it is 'globalts' itself. 
- /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; } - - /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); static std::vector x; return x;/*keep compiler happy*/ } - }; - - // --------------------------------------------------------------------------- - // minibatchframesourcemulti -- feature source to provide randomized frames in minibatches - // this is derived from minibatchframesource but worked with multiple inputs and/or outputs - // by making "frames" and "classids" a vector of vectors - // --------------------------------------------------------------------------- - class minibatchframesourcemulti : public minibatchsource - { - std::vector vdim; // feature dimension after augmenting neighhors (0: don't read features) - std::vector leftcontext; // number of frames to the left of the target frame in the context window - std::vector rightcontext; // number of frames to the right of the target frame in the context window - unsigned int sampperiod; // (for reference and to check against model) - string featkind; - size_t featdim; - size_t maxvdim; - // cache - //std::vector frames; - std::vector> pframes; // [t][i] all features concatenated - std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors()) - std::vector> classids; // [t] the state that the frame belongs to - size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels - msra::dbn::randomordering randomordering; // [t] -> t' - double timegetbatch; - int verbosity; - - public: - // constructor - // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). - minibatchframesourcemulti (const std::vector> & infiles, const std::vector>> & labels, - std::vector vdim, std::vector udim, std::vector leftcontext, std::vector rightcontext, size_t randomizationrange, const std::vector & pagepath, const bool mayhavenoframe=false, int addEnergy=0) - : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), numframes (0), timegetbatch (0), verbosity(2), maxvdim(0) - { - - if (vdim[0] == 0 && labels.empty()) - throw runtime_error ("minibatchframesourcemulti: when running without features, labels are needed"); - // at this stage, we simply page in the entire training set at once and work off RAM - // We will benefit from feature archives indirectly through htkfeatio. 
- // TODO: - // - infiles must specify time range - // - at this stage only reserve() (we know the time range; allocate second-layer structure) - // - implement block-wise paging directly from HTK feature files through htkfeatreader - featkind.clear(); - std::vector frame; - std::vectornumclasses; // number of units found (actually max id +1) - size_t notfound = 0; // number of entries missing in MLF - - - std::vectorframesaccum; - - if (infiles.size()==0) - throw runtime_error("minibatchframesourcemulti: need at least one network input specified with features"); - - if (labels.size()==0) - fprintf(stderr,"no MLF label files detected\n"); - - foreach_index (i, infiles) - { - pframes.push_back(unique_ptr(new biggrowablevectorarray(pagepath[i]))); - - if (vdim[i]>maxvdim) - maxvdim=vdim[i]; - } - - - foreach_index (i, labels) - { - classids.push_back(std::vector()); - numclasses.push_back(0); - } - - - fprintf (stderr, "minibatchframesourcemulti: reading %d feature sets and %d label sets...", infiles.size(),labels.size()); - - foreach_index (m, infiles) - { - - - featdim=0; - numframes=0; - featkind.clear(); - msra::asr::htkfeatreader reader; // feature reader - reader.AddEnergy(addEnergy); - - foreach_index (i, infiles[m]) // read each feature file in set m - { - if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } - msra::basetypes::matrix feat; - msra::asr::htkfeatreader::parsedpath ppath (infiles[m][i]); - - // skip files for which labels don't exist (assuming bad alignment) - wstring key; - if (!labels.empty()) - { - if (!labels[0].empty()) // empty means unsupervised mode (don't load any) - { - key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) - if (labels[0].find (key) == labels[0].end()) - { - if (notfound < 5) - fprintf (stderr, "\nminibatchframesourcemulti: %d-th file not found in MLF label set: %S", i, key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - } - // get feature frames - if (vdim[m] != 0) // (vdim == special mode to not read features at all) - { - msra::util::attempt (5, [&]() - { - reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors - }); - if (featdim == 0) // first time - featdim = feat.rows(); - else if (featdim != feat.rows()) - throw std::runtime_error ("minibatchframesourcemulti: inconsistent feature dimension across files"); - // HVite occasionally generates mismatching output --skip such files - if (!key.empty()) // (we have a key if supervised mode) - { - const auto & labseq = labels[0].find (key)->second; // (we already checked above that it exists) - size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); - if (abs ((int) labframes - (int) feat.cols()) > 0) - { - fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - // append to cache - frame.resize (featdim); - if (feat.cols() < 2) // (2 frames needed for boundary markers) - throw std::runtime_error ("minibatchframesourcemulti: utterances < 2 frames not supported"); - foreach_column (t, feat) - { - foreach_index (k, frame) - frame[k] = feat(k,t); - - pframes[m]->push_back (frame); - numframes++; - if (m==0) - boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? 
+1 : 0); - } - if (m==0) - framesaccum.push_back(numframes); - else - assert(numframes == framesaccum[i]); - - assert (numframes == pframes[m]->size()); - } - if (m==0) - assert (numframes == boundaryflags.size()); - - - - if (m==0) // after we get the key for this file, read all labels (only done for first feature) - { - if (!key.empty()) - { - foreach_index (j, labels) - { - const auto & labseq = labels[j].find (key)->second; // (we already checked above that it exists) - foreach_index (i, labseq) - { - const auto & e = labseq[i]; - if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: labels not in consecutive order MLF in label set: %S", key.c_str())); - for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) - { - if (e.classid >= udim[j]) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: class id exceeds model dimension in file %S", key.c_str())); - if (e.classid != (CLASSIDTYPE) e.classid) - throw std::runtime_error ("CLASSIDTYPE has too few bits"); - classids[j].push_back ((CLASSIDTYPE) e.classid); - numclasses[j] = max (numclasses[j], 1u + e.classid); - } - } - if (vdim[m] == 0) - numframes = classids[j].size(); - if (numframes != classids[j].size()) // TODO: remove this once we are confident - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); - assert (numframes == classids[j].size()); - - } - } - else - { - assert(classids.empty()); - } - - } - - } - - - assert (vdim[m] == 0 || numframes == pframes[m]->size()); - - foreach_index(j, labels) - assert (labels[j].empty() || numframes == classids[j].size()); - - if (vdim[m] != 0 && numframes != pframes[m]->size()) // || (!labels.empty() && numframes != classids.size())) - throw std::runtime_error ("\nminibatchframesource: numframes variable screwup"); - if (m==0) - { - foreach_index (j, numclasses) - fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %d classes\n", j, numclasses[j]); - } - fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %d frames read from %d utterances\n", m, pframes[m]->size(), infiles[m].size()); - if (notfound > 0) - { - fprintf (stderr, "minibatchframesourcemulti: %d files out of %d not found in label set\n", notfound, infiles[m].size()); - if (notfound > infiles[m].size() / 2) - throw std::runtime_error ("minibatchframesourcemulti: too many files not found in label set--assuming broken configuration\n"); - } - // notify frames source to switch from population to consumption mode - pframes[m]->no_more_push_back(); - - } - - if (numframes == 0 && !mayhavenoframe) - throw std::runtime_error ("minibatchframesource: no input features given!"); - - - // initialize randomizer - if (numframes > 0) - randomordering.resize (numframes, randomizationrange); - - } - virtual ~minibatchframesourcemulti() {} - size_t totalframes() const { - assert (maxvdim == 0 || numframes == pframes[0]->size()); assert (!issupervised() || numframes == classids[0].size()); return numframes; } - - bool issupervised() const { return !classids.empty(); } - - void setverbosity(int newverbosity) { verbosity = newverbosity; } - - // retrieve one minibatch - // Minibatches are deterministic pseudo-random samples. The entire corpus - // is repeated infinitely, but each repetition (a 'sweep') is randomized - // differently. 
- // This function allows to retrieve a mini-batch starting from any frame - // within this infinitely extended repetition. To the end, mini-batches are - // specified by start frame and #frames. - // This function returns the same data independent on #frames, i.e. the concept - // of the mini-batch is not defined in here, but on the caller side. The caller - // can retrieve the frames of a mini-batch in chunks that do not match the - // caller's definition of "mini-batch," e.g. bigger or smaller chunks. - // If a requested mini-batch spans a sweep boundary, then this function will - // not return samples after the sweep boundary. Instead, the returned frame - // set is shortened to not exceed the end of the sweep. The caller must make - // a separate second call to get the rest. In trainlayer(), the one - // sweep-boundary-spanning mini-batch will simply be shortened. - // This function is NOT thread-safe (due to caching of random sequence). - bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, - std::vector> & transcripts, - std::vector> & latticepairs) - { - - auto_timer timergetbatch; - bool readfromdisk; - size_t nreadfromdisk=0; - transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) - latticepairs.clear(); // neither are lattices - - assert (totalframes() > 0); - const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) - const size_t ts = globalts % totalframes(); // start frame within the sweep - const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary - assert (te > ts); - if (verbosity >= 2) - fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); - - // get random sequence (each time index occurs exactly once) - // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. - const auto & tmap = randomordering (sweep); - - feat.resize(pframes.size()); - uids.resize(classids.size()); - foreach_index(i, feat) - { - size_t leftextent, rightextent; - // page in the needed range of frames - if (leftcontext[i] == 0 && rightcontext[i] == 0) - { - leftextent = rightextent = augmentationextent(pframes[i]->dim(), vdim[i]); - } - else - { - leftextent = leftcontext[i]; - rightextent = rightcontext[i]; - } - readfromdisk = pframes[i]->require (randomordering.bounds (max (ts, leftextent) - leftextent, te + 1 + rightextent)); - // generate features and uids - feat[i].resize (vdim[i], te - ts); // note: special mode vdim == 0 means no features to be loaded - if (issupervised()) // empty means unsupervised training -> return empty uids - foreach_index(j, uids) - uids[j].resize (te - ts); - else - uids.clear(); - - for (size_t t = ts; t < te; t++) - { - size_t trand = tmap[t]; // the random-sequence sample point for this point in time - if (vdim[i] != 0) - { - auto v_t = feat[i].col(t-ts); // the vector to fill in - augmentneighbors (*pframes[i], boundaryflags, trand, leftextent, rightextent, v_t); - } - if (i==0){ // read labels for all outputs on first pass thru features. this guarantees they will be read if only one feature set but > 1 label set - if (issupervised()) - foreach_index(j, uids) - uids[j][t-ts] = classids[j][trand]; - } - } - timegetbatch = timergetbatch; - if (readfromdisk) - nreadfromdisk++; - - } - - (nreadfromdisk==feat.size()) ? 
readfromdisk = true : readfromdisk = false; - - return readfromdisk; - - } - - bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector & /*uids*/, - std::vector> & /*transcripts*/, - std::vector> & /*latticepairs*/) - { - // should never get here - throw runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchframesource instead\n"); - } - - double gettimegetbatch () { return timegetbatch;} - - // return first valid globalts to ask getbatch() for - // In frame mode, there is no constraint, i.e. it is 'globalts' itself. - /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; } - - /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); } - - }; -};}; \ No newline at end of file +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// rollingwindowsource.h -- implementation of a rolling-window minibatch source ('minibatchframesource') with a disk page file +// + +#pragma once + +#include "basetypes.h" // for attempt() +#include "numahelpers.h" // for NUMA allocation +#include "minibatchsourcehelpers.h" +#include "minibatchiterator.h" +#include "biggrowablevectors.h" +#include "ssematrix.h" + +namespace msra { namespace dbn { + + // --------------------------------------------------------------------------- + // biggrowablevectorarray -- a big array of vectors for features, growable (push_back) + // Data is striped across NUMA nodes, as to not clog them up. + // This also supports paging to disk, which is used for the old minibatchframesource. + // --------------------------------------------------------------------------- + class biggrowablevectorarray : public growablevectorbase + { + size_t m; // dim + + size_t inmembegin; // range we have in memory, rounded to enclosing blocks (not rounded at end) + size_t inmemend; + + wstring pagepath; // path for paging, empty if no paging + auto_file_ptr f; // file handle for paging + bool reading; // have we begun reading? + + // allocate a block + msra::dbn::matrix * newblock() const + { + // we stripe the data across NUMA nodes as to not fill up one node with the feature data + msra::numa::overridenode ((int) msra::numa::getmostspaciousnumanode()); + msra::dbn::matrix * res = new msra::dbn::matrix (m, elementsperblock); + msra::numa::overridenode (-1); // note: we really should reset it also in case of failure + return res; + } + + // handling of page file + bool paging() const { return !pagepath.empty(); } + void openpagefile (bool wantread) + { + if (!paging()) return; + msra::files::make_intermediate_dirs (pagepath); + + if (!wantread) + { + FILE *ftry = NULL; + wstring pathname (pagepath); + ftry = _wfopen (pathname.c_str(), L"wbS"); + if (ftry) fclose (ftry); + } + + /* + code below to cycle through a-z appended to file name is no longer necessary + since caller guarantees unique file names via HTKMLFReader + and we want the pagepath logged to the user to be the actual one used by the code + + // try to open the pagepath from a to z + if (!wantread) + { + FILE *ftry = NULL; + char trynum = 'a'; + while (!ftry && trynum <= 'z') + { + wstring pathname (pagepath); + pathname += trynum++; + ftry = _wfopen (pathname.c_str(), L"wbS"); + } + if (ftry) fclose (ftry); + pagepath += --trynum; + } + */ + f = fopenOrDie (pagepath, wantread ? 
L"rbS" : L"wbS"); + reading = wantread; + } + void flushlastblock() // during population phase, must be called once per block in sequence + { + if (!paging()) return; + assert (!reading); + if (blocks.empty()) return; + const size_t blockid = blocks.size() -1; + msra::dbn::matrix & block = *blocks[blockid]; + assert (fgetpos (f) == blockid * block.sizeinpagefile()); + block.topagefile (f); + blocks[blockid].reset(); // free the memory + assert (blockid * elementsperblock == inmembegin); + inmembegin = inmemend; // empty range + } + void releaseblock (size_t t0) // t0=block start time + { + assert (paging() && reading); + size_t blockid = t0 / elementsperblock; + assert (blockid * elementsperblock == t0); + assert (blocks[blockid]); + fprintf (stderr, "recoverblock: releasing feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); + blocks[blockid].reset(); // free the memory + } + void recoverblock (size_t t0) // t0=block start time + { + assert (paging() && reading); + size_t blockid = t0 / elementsperblock; + assert (blockid * elementsperblock == t0); + assert (!blocks[blockid]); + fprintf (stderr, "recoverblock: recovering feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); + blocks[blockid].reset (newblock()); + msra::dbn::matrix & block = *blocks[blockid]; + fsetpos (f, blockid * block.sizeinpagefile()); + block.frompagefile (f); + } + + public: + biggrowablevectorarray (const wstring & pagepath) + : growablevectorbase (65536), m (0), + inmembegin (0), inmemend (0), pagepath (pagepath), reading (false) + { + openpagefile (false); + if (paging()) + fprintf (stderr, "biggrowablevectorarray: creating disk backup store at '%S'\n", pagepath.c_str()); + } + ~biggrowablevectorarray() { // clean up the big temp file + if (paging()) { + fclose (f); + if (_wunlink (pagepath.c_str())==0) + fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); + else + fprintf (stderr, "biggrowablevectorarray: unable to delete disk backup store at '%S'\n", pagepath.c_str()); + } + } + + size_t dim() const { return m; } // dimension of a frame + + // reading phase + void push_back (const std::vector & in) + { + assert (!in.empty()); + assert (m == 0 || m == in.size()); + m = in.size(); + const size_t blockid = n / elementsperblock; + assert (blockid <= blocks.size()); + if (blockid == blocks.size()) // a new block is needed + { + flushlastblock(); + blocks.push_back (std::unique_ptr (newblock())); + } + const size_t blockn = n % elementsperblock; + msra::dbn::matrix & block = *blocks[blockid].get(); + foreach_index (k, in) + block(k,blockn) = in[k]; + n++; + inmemend = n; + } + void no_more_push_back() // done pushing --switch to consumption mode + { + if (!paging()) return; + // finish off last block + flushlastblock(); + fflushOrDie (f); + fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f)); + fclose (f); + foreach_index (i, blocks) assert (!blocks[i]); // ensure we flushed + assert (inmembegin == inmemend); // nothing in cache + // switch to reading mode + openpagefile (true); + } + + // access phase + // Returns 'true' if data was actually read from disk. 
+ bool require (pair bounds) // we require this range of frames + { + bool readfromdisk = false; + + // get bounds rounded to block boundaries + const size_t ts = bounds.first / elementsperblock * elementsperblock; + const size_t te = min (n, (bounds.second + elementsperblock -1) / elementsperblock * elementsperblock); + assert (paging()); + // free all the memmory + for (size_t t = inmembegin; t < inmemend; t += elementsperblock) + { + if (t >= ts && t < te) // if in wanted range then skip to end of it + t = te - elementsperblock; + else + releaseblock (t); + } + // page in all required blocks + for (size_t t = ts; t < te; t += elementsperblock) + { + if (t >= inmembegin && t < inmemend) // if in memory already then skip to end of it + t = inmemend - elementsperblock; + else + { + recoverblock (t); + readfromdisk = true; // tell caller we did something expensive + } + } + // got it + inmembegin = ts; + inmemend = te; + return readfromdisk; + } + const msra::dbn::matrixstripe operator[] (size_t t) const // get a feature vector + { + if (t < inmembegin || t >= inmemend) + throw std::logic_error ("biggrowablevectorarray: attempt to access vector without requesting to page it in first"); + const size_t blockt = getblockt (t); + /*const*/ msra::dbn::matrix & block = getblock (t); + return msra::dbn::matrixstripe (block, blockt, 1); + } + wstring pagepathname(){ return pagepath;} + void cleanuppagefile() + { + if (paging()) { + fclose (f); + if (_wunlink (pagepath.c_str())==0){ + fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); + } + else{ + fprintf (stderr, "biggrowablevectorarray: could NOT delete disk backup store at '%S'\n", pagepath.c_str()); + } + } + } + }; + + // --------------------------------------------------------------------------- + // minibatchframesource -- feature source to provide randomized frames in minibatches + // This is the old code that pages all frames to a huge disk file first. + // (The new minibatchutterancesource pages from input files directly and can also + // operate in utterance mode for MMI training.) + // --------------------------------------------------------------------------- + class minibatchframesource : public minibatchsource + { + size_t vdim; // feature dimension after augmenting neighhors (0: don't read features) + unsigned int sampperiod; // (for reference and to check against model) + string featkind; + size_t featdim; + // cache + biggrowablevectorarray frames; // [t][i] all features concatenated + std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors()) + std::vector classids; // [t] the state that the frame belongs to + size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels + msra::dbn::randomordering randomordering; // [t] -> t' + double timegetbatch; + int verbosity; + public: + // constructor + // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). 
+ minibatchframesource (const std::vector & infiles, const map> & labels, + size_t vdim, size_t udim, size_t randomizationrange, const wstring & pagepath, const bool mayhavenoframe=false, int addEnergy=0) + : vdim (vdim), sampperiod (0), featdim (0), numframes (0), frames (pagepath), timegetbatch (0), verbosity(2) + { + if (vdim == 0 && labels.empty()) + throw runtime_error ("minibatchframesource: when running without features, labels are needed"); + // at this stage, we simply page in the entire training set at once and work off RAM + // We will benefit from feature archives indirectly through htkfeatio. + // TODO: + // - infiles must specify time range + // - at this stage only reserve() (we know the time range; allocate second-layer structure) + // - implement block-wise paging directly from HTK feature files through htkfeatreader + featkind.clear(); + std::vector frame; + fprintf (stderr, "minibatchframesource: reading %d utterances..", infiles.size()); + size_t numclasses = 0; // number of units found (actually max id +1) + size_t notfound = 0; // number of entries missing in MLF + msra::asr::htkfeatreader reader; // feature reader + reader.AddEnergy(addEnergy); + + foreach_index (i, infiles) + { + if (i % (infiles.size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } + msra::basetypes::matrix feat; + msra::asr::htkfeatreader::parsedpath ppath (infiles[i]); + + // skip files for which labels don't exist (assuming bad alignment) + wstring key; + if (!labels.empty()) // empty means unsupervised mode (don't load any) + { + key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) + if (labels.find (key) == labels.end()) + { + if (notfound < 5) + fprintf (stderr, "\nminibatchframesource: %d-th file not found in MLF label set: %S", i, key.c_str()); + notfound++; + continue; // skip this utterance at all + } + } + + // get feature frames + if (vdim != 0) // (vdim == special mode to not read features at all) + { + msra::util::attempt (5, [&]() + { + reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors + }); + if (featdim == 0) // first time + featdim = feat.rows(); + else if (featdim != feat.rows()) + throw std::runtime_error ("minibatchframesource: inconsistent feature dimension across files"); + // HVite occasionally generates mismatching output --skip such files + if (!key.empty()) // (we have a key if supervised mode) + { + const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) + size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); + if (abs ((int) labframes - (int) feat.cols()) > 0) + { + fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str()); + notfound++; + continue; // skip this utterance at all + } + } + // append to cache + frame.resize (featdim); + if (feat.cols() < 2) // (2 frames needed for boundary markers) + throw std::runtime_error ("minibatchframesource: utterances < 2 frames not supported"); + foreach_column (t, feat) + { + foreach_index (k, frame) + frame[k] = feat(k,t); + frames.push_back (frame); + numframes++; + boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? 
+1 : 0); + } + assert (numframes == frames.size()); + assert (numframes == boundaryflags.size()); + } + + // get label sequence + if (!key.empty()) // (we have a key if supervised mode) + { + const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) + foreach_index (i, labseq) + { + const auto & e = labseq[i]; + if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: labels not in consecutive order MLF in label set: %S", key.c_str())); + for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) + { + if (e.classid >= udim) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: class id exceeds model dimension in file %S", key.c_str())); + if (e.classid != (CLASSIDTYPE) e.classid) + throw std::runtime_error ("CLASSIDTYPE has too few bits"); + classids.push_back ((CLASSIDTYPE) e.classid); + numclasses = max (numclasses, 1u + e.classid); + } + } + if (vdim == 0) + numframes = classids.size(); + if (numframes != classids.size()) // TODO: remove this once we are confident + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); + assert (numframes == classids.size()); + } + else + { + assert (classids.empty()); // that's how we detect it later + } + } + assert (vdim == 0 || numframes == frames.size()); + assert (labels.empty() || numframes == classids.size()); + if ((vdim != 0 && numframes != frames.size()) || (!labels.empty() && numframes != classids.size())) + throw std::runtime_error ("minibatchframesource: numframes variable screwup"); + fprintf (stderr, " %d frames read from %d utterances; %d classes\n", numframes, infiles.size(), numclasses); + if (notfound > 0) + { + fprintf (stderr, "minibatchframesource: %d files out of %d not found in label set\n", notfound, infiles.size()); + if (notfound > infiles.size() / 2) + throw std::runtime_error ("minibatchframesource: too many files not found in label set--assuming broken configuration\n"); + } + + if (numframes == 0 && !mayhavenoframe) + throw std::runtime_error ("minibatchframesource: no input features given!"); + + // notify frames source to switch from population to consumption mode + frames.no_more_push_back(); + + // initialize randomizer + if (numframes > 0) + randomordering.resize (numframes, randomizationrange); + } + virtual ~minibatchframesource() {} + size_t totalframes() const { assert (vdim == 0 || numframes == frames.size()); assert (!issupervised() || numframes == classids.size()); return numframes; } + + bool issupervised() const { return !classids.empty(); } + + void setverbosity(int newverbosity) { verbosity = newverbosity; } + + // retrieve one minibatch + // Minibatches are deterministic pseudo-random samples. The entire corpus + // is repeated infinitely, but each repetition (a 'sweep') is randomized + // differently. + // This function allows to retrieve a mini-batch starting from any frame + // within this infinitely extended repetition. To the end, mini-batches are + // specified by start frame and #frames. + // This function returns the same data independent on #frames, i.e. the concept + // of the mini-batch is not defined in here, but on the caller side. The caller + // can retrieve the frames of a mini-batch in chunks that do not match the + // caller's definition of "mini-batch," e.g. bigger or smaller chunks. 
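+    // For example (an illustrative call sequence only; 'source' and the chunk
+    // sizes are made up), a logical minibatch of 512 frames may equally well be
+    // fetched in two halves:
+    //     source.getbatch (globalts,       256, feat, uids, transcripts, latticepairs);
+    //     source.getbatch (globalts + 256, 256, feat, uids, transcripts, latticepairs);
+    // Both calls draw from the same deterministic random order of the sweep.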
+ // If a requested mini-batch spans a sweep boundary, then this function will + // not return samples after the sweep boundary. Instead, the returned frame + // set is shortened to not exceed the end of the sweep. The caller must make + // a separate second call to get the rest. In trainlayer(), the one + // sweep-boundary-spanning mini-batch will simply be shortened. + // This function is NOT thread-safe (due to caching of random sequence). + bool getbatch (const size_t globalts, const size_t framesrequested, msra::dbn::matrix & feat, std::vector & uids, + std::vector> & transcripts, + std::vector> & latticepairs) + { + auto_timer timergetbatch; + + transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) + latticepairs.clear(); // neither are lattices + + assert (totalframes() > 0); + const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) + const size_t ts = globalts % totalframes(); // start frame within the sweep + const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary + assert (te > ts); + if (verbosity >= 2) + fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); + + // get random sequence (each time index occurs exactly once) + // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. + const auto & tmap = randomordering (sweep); + + // page in the needed range of frames + const size_t extent = augmentationextent (frames.dim(), vdim); + bool readfromdisk = frames.require (randomordering.bounds (max (ts, extent) - extent, te + 1 + extent)); + + // generate features and uids + feat.resize (vdim, te - ts); // note: special mode vdim == 0 means no features to be loaded + if (issupervised()) // empty means unsupervised training -> return empty uids + uids.resize (te - ts); + else + uids.clear(); + for (size_t t = ts; t < te; t++) + { + size_t trand = tmap[t]; // the random-sequence sample point for this point in time + if (vdim != 0) + { + auto v_t = feat.col(t-ts); // the vector to fill in + augmentneighbors (frames, boundaryflags, trand, v_t); + } + if (issupervised()) + uids[t-ts] = classids[trand]; + } + timegetbatch = timergetbatch; + return readfromdisk; + } + + bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, + std::vector> & transcripts, + std::vector> & latticepairs) + { + // for single input/output set size to be 1 and run old getbatch + feat.resize(1); + uids.resize(1); + //transcripts.resize(1); + //latticepairs.resize(1); + return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs); + } + + double gettimegetbatch () { return timegetbatch;} + + // return first valid globalts to ask getbatch() for + // In frame mode, there is no constraint, i.e. it is 'globalts' itself. 
+    /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; }
+
+    /*implement*/ const std::vector<size_t> & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); static std::vector<size_t> x; return x; /*keep compiler happy*/ }
+};
+
+// ---------------------------------------------------------------------------
+// minibatchframesourcemulti -- feature source to provide randomized frames in minibatches
+// this is derived from minibatchframesource but works with multiple inputs and/or outputs
+// by making "frames" and "classids" a vector of vectors
+// ---------------------------------------------------------------------------
+class minibatchframesourcemulti : public minibatchsource
+{
+    std::vector<size_t> vdim;           // feature dimension after augmenting neighbors (0: don't read features)
+    std::vector<size_t> leftcontext;    // number of frames to the left of the target frame in the context window
+    std::vector<size_t> rightcontext;   // number of frames to the right of the target frame in the context window
+    unsigned int sampperiod;            // (for reference and to check against model)
+    string featkind;
+    size_t featdim;
+    size_t maxvdim;
+    // cache
+    //std::vector frames;
+    std::vector<unique_ptr<biggrowablevectorarray>> pframes;    // [i] one frame cache per feature set; [t] all features concatenated
+    std::vector<char> boundaryflags;                // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors())
+    std::vector<std::vector<CLASSIDTYPE>> classids; // [j][t] the state that the frame belongs to, per label set
+    size_t numframes;           // total frames (==pframes[*]->size()==boundaryflags.size()==classids[*].size()) unless special modes vdim == 0 and/or no labels
+    msra::dbn::randomordering randomordering;   // [t] -> t'
+    double timegetbatch;
+    int verbosity;
+
+public:
+    // constructor
+    // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
+    minibatchframesourcemulti (const std::vector<std::vector<wstring>> & infiles, const std::vector<map<wstring, std::vector<msra::asr::htkmlfentry>>> & labels,
+                               std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext,
+                               size_t randomizationrange, const std::vector<wstring> & pagepath, const bool mayhavenoframe = false, int addEnergy = 0)
+        : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), numframes (0), timegetbatch (0), verbosity(2), maxvdim(0)
+    {
+        if (vdim[0] == 0 && labels.empty())
+            throw runtime_error ("minibatchframesourcemulti: when running without features, labels are needed");
+        // at this stage, we simply page in the entire training set at once and work off RAM
+        // We will benefit from feature archives indirectly through htkfeatio.
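+        // Note: foreach_index and foreach_column are iteration macros from the
+        // basetypes headers; foreach_index (i, v) expands to roughly
+        //     for (int i = 0; i < (int) v.size(); i++)
+        // (reconstructed from its usage below; see basetypes.h for the exact form).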
+        // TODO:
+        //  - infiles must specify time range
+        //  - at this stage only reserve() (we know the time range; allocate second-layer structure)
+        //  - implement block-wise paging directly from HTK feature files through htkfeatreader
+        featkind.clear();
+        std::vector<float> frame;
+        std::vector<size_t> numclasses;     // number of units found per label set (actually max id + 1)
+        size_t notfound = 0;                // number of entries missing in MLF
+
+        std::vector<size_t> framesaccum;
+
+        if (infiles.size() == 0)
+            throw runtime_error("minibatchframesourcemulti: need at least one network input specified with features");
+
+        if (labels.size() == 0)
+            fprintf(stderr, "no MLF label files detected\n");
+
+        foreach_index (i, infiles)
+        {
+            pframes.push_back(unique_ptr<biggrowablevectorarray>(new biggrowablevectorarray(pagepath[i])));
+
+            if (vdim[i] > maxvdim)
+                maxvdim = vdim[i];
+        }
+
+        foreach_index (i, labels)
+        {
+            classids.push_back(std::vector<CLASSIDTYPE>());
+            numclasses.push_back(0);
+        }
+
+        fprintf (stderr, "minibatchframesourcemulti: reading %d feature sets and %d label sets...", infiles.size(), labels.size());
+
+        foreach_index (m, infiles)
+        {
+            featdim = 0;
+            numframes = 0;
+            featkind.clear();
+            msra::asr::htkfeatreader reader;    // feature reader
+            reader.AddEnergy(addEnergy);
+
+            foreach_index (i, infiles[m])   // read each feature file in set m
+            {
+                if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
+                msra::basetypes::matrix<float> feat;
+                msra::asr::htkfeatreader::parsedpath ppath (infiles[m][i]);
+
+                // skip files for which labels don't exist (assuming bad alignment)
+                wstring key;
+                if (!labels.empty())
+                {
+                    if (!labels[0].empty())     // empty means unsupervised mode (don't load any)
+                    {
+                        key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none)
+                        if (labels[0].find (key) == labels[0].end())
+                        {
+                            if (notfound < 5)
+                                fprintf (stderr, "\nminibatchframesourcemulti: %d-th file not found in MLF label set: %S", i, key.c_str());
+                            notfound++;
+                            continue;   // skip this utterance altogether
+                        }
+                    }
+                }
+                // get feature frames
+                if (vdim[m] != 0)   // (vdim == 0 is a special mode: don't read features at all)
+                {
+                    msra::util::attempt (5, [&]()
+                    {
+                        reader.read (ppath, featkind, sampperiod, feat);    // whole file read as columns of feature vectors
+                    });
+                    if (featdim == 0)   // first time
+                        featdim = feat.rows();
+                    else if (featdim != feat.rows())
+                        throw std::runtime_error ("minibatchframesourcemulti: inconsistent feature dimension across files");
+                    // HVite occasionally generates mismatching output -- skip such files
+                    if (!key.empty())   // (we have a key if supervised mode)
+                    {
+                        const auto & labseq = labels[0].find (key)->second; // (we already checked above that it exists)
+                        size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
+                        if (abs ((int) labframes - (int) feat.cols()) > 0)
+                        {
+                            fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
+                            notfound++;
+                            continue;   // skip this utterance altogether
+                        }
+                    }
+                    // append to cache
+                    frame.resize (featdim);
+                    if (feat.cols() < 2)    // (2 frames needed for boundary markers)
+                        throw std::runtime_error ("minibatchframesourcemulti: utterances < 2 frames not supported");
+                    foreach_column (t, feat)
+                    {
+                        foreach_index (k, frame)
+                            frame[k] = feat(k,t);
+
+                        pframes[m]->push_back (frame);
+                        numframes++;
+                        if (m == 0)
+                            boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() - 1) ?
+1 : 0); + } + if (m==0) + framesaccum.push_back(numframes); + else + assert(numframes == framesaccum[i]); + + assert (numframes == pframes[m]->size()); + } + if (m==0) + assert (numframes == boundaryflags.size()); + + + + if (m==0) // after we get the key for this file, read all labels (only done for first feature) + { + if (!key.empty()) + { + foreach_index (j, labels) + { + const auto & labseq = labels[j].find (key)->second; // (we already checked above that it exists) + foreach_index (i, labseq) + { + const auto & e = labseq[i]; + if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: labels not in consecutive order MLF in label set: %S", key.c_str())); + for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) + { + if (e.classid >= udim[j]) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: class id exceeds model dimension in file %S", key.c_str())); + if (e.classid != (CLASSIDTYPE) e.classid) + throw std::runtime_error ("CLASSIDTYPE has too few bits"); + classids[j].push_back ((CLASSIDTYPE) e.classid); + numclasses[j] = max (numclasses[j], 1u + e.classid); + } + } + if (vdim[m] == 0) + numframes = classids[j].size(); + if (numframes != classids[j].size()) // TODO: remove this once we are confident + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); + assert (numframes == classids[j].size()); + + } + } + else + { + assert(classids.empty()); + } + + } + + } + + + assert (vdim[m] == 0 || numframes == pframes[m]->size()); + + foreach_index(j, labels) + assert (labels[j].empty() || numframes == classids[j].size()); + + if (vdim[m] != 0 && numframes != pframes[m]->size()) // || (!labels.empty() && numframes != classids.size())) + throw std::runtime_error ("\nminibatchframesource: numframes variable screwup"); + if (m==0) + { + foreach_index (j, numclasses) + fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %d classes\n", j, numclasses[j]); + } + fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %d frames read from %d utterances\n", m, pframes[m]->size(), infiles[m].size()); + if (notfound > 0) + { + fprintf (stderr, "minibatchframesourcemulti: %d files out of %d not found in label set\n", notfound, infiles[m].size()); + if (notfound > infiles[m].size() / 2) + throw std::runtime_error ("minibatchframesourcemulti: too many files not found in label set--assuming broken configuration\n"); + } + // notify frames source to switch from population to consumption mode + pframes[m]->no_more_push_back(); + + } + + if (numframes == 0 && !mayhavenoframe) + throw std::runtime_error ("minibatchframesource: no input features given!"); + + + // initialize randomizer + if (numframes > 0) + randomordering.resize (numframes, randomizationrange); + + } + virtual ~minibatchframesourcemulti() {} + size_t totalframes() const { + assert (maxvdim == 0 || numframes == pframes[0]->size()); assert (!issupervised() || numframes == classids[0].size()); return numframes; } + + bool issupervised() const { return !classids.empty(); } + + void setverbosity(int newverbosity) { verbosity = newverbosity; } + + // retrieve one minibatch + // Minibatches are deterministic pseudo-random samples. The entire corpus + // is repeated infinitely, but each repetition (a 'sweep') is randomized + // differently. 
+ // This function allows to retrieve a mini-batch starting from any frame + // within this infinitely extended repetition. To the end, mini-batches are + // specified by start frame and #frames. + // This function returns the same data independent on #frames, i.e. the concept + // of the mini-batch is not defined in here, but on the caller side. The caller + // can retrieve the frames of a mini-batch in chunks that do not match the + // caller's definition of "mini-batch," e.g. bigger or smaller chunks. + // If a requested mini-batch spans a sweep boundary, then this function will + // not return samples after the sweep boundary. Instead, the returned frame + // set is shortened to not exceed the end of the sweep. The caller must make + // a separate second call to get the rest. In trainlayer(), the one + // sweep-boundary-spanning mini-batch will simply be shortened. + // This function is NOT thread-safe (due to caching of random sequence). + bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, + std::vector> & transcripts, + std::vector> & latticepairs) + { + + auto_timer timergetbatch; + bool readfromdisk; + size_t nreadfromdisk=0; + transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) + latticepairs.clear(); // neither are lattices + + assert (totalframes() > 0); + const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) + const size_t ts = globalts % totalframes(); // start frame within the sweep + const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary + assert (te > ts); + if (verbosity >= 2) + fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); + + // get random sequence (each time index occurs exactly once) + // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. + const auto & tmap = randomordering (sweep); + + feat.resize(pframes.size()); + uids.resize(classids.size()); + foreach_index(i, feat) + { + size_t leftextent, rightextent; + // page in the needed range of frames + if (leftcontext[i] == 0 && rightcontext[i] == 0) + { + leftextent = rightextent = augmentationextent(pframes[i]->dim(), vdim[i]); + } + else + { + leftextent = leftcontext[i]; + rightextent = rightcontext[i]; + } + readfromdisk = pframes[i]->require (randomordering.bounds (max (ts, leftextent) - leftextent, te + 1 + rightextent)); + // generate features and uids + feat[i].resize (vdim[i], te - ts); // note: special mode vdim == 0 means no features to be loaded + if (issupervised()) // empty means unsupervised training -> return empty uids + foreach_index(j, uids) + uids[j].resize (te - ts); + else + uids.clear(); + + for (size_t t = ts; t < te; t++) + { + size_t trand = tmap[t]; // the random-sequence sample point for this point in time + if (vdim[i] != 0) + { + auto v_t = feat[i].col(t-ts); // the vector to fill in + augmentneighbors (*pframes[i], boundaryflags, trand, leftextent, rightextent, v_t); + } + if (i==0){ // read labels for all outputs on first pass thru features. this guarantees they will be read if only one feature set but > 1 label set + if (issupervised()) + foreach_index(j, uids) + uids[j][t-ts] = classids[j][trand]; + } + } + timegetbatch = timergetbatch; + if (readfromdisk) + nreadfromdisk++; + + } + + (nreadfromdisk==feat.size()) ? 
readfromdisk = true : readfromdisk = false; + + return readfromdisk; + + } + + bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector & /*uids*/, + std::vector> & /*transcripts*/, + std::vector> & /*latticepairs*/) + { + // should never get here + throw runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchframesource instead\n"); + } + + double gettimegetbatch () { return timegetbatch;} + + // return first valid globalts to ask getbatch() for + // In frame mode, there is no constraint, i.e. it is 'globalts' itself. + /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; } + + /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); } + + }; +};}; diff --git a/DataReader/HTKMLFReader/utterancesource.h b/DataReader/HTKMLFReader/utterancesource.h index 672cb72c4..982269667 100644 --- a/DataReader/HTKMLFReader/utterancesource.h +++ b/DataReader/HTKMLFReader/utterancesource.h @@ -768,6 +768,7 @@ private: if (chunkdata.isinram()) return false; + if (verbosity) fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1); msra::util::attempt (5, [&]() // (reading from network) { @@ -858,6 +859,7 @@ public: transcripts.clear(); // return these utterances + if (verbosity > 0) fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep); size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch for (size_t pos = spos; pos < epos; pos++) @@ -922,6 +924,7 @@ public: const size_t lastchunk = chunkforframepos (globalte-1); const size_t windowbegin = randomizedchunks[firstchunk].windowbegin; const size_t windowend = randomizedchunks[lastchunk].windowend; + if (verbosity > 0) fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n", globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend); // release all data outside, and page in all data inside diff --git a/DataReader/HTKMLFReader/utterancesourcemulti.h b/DataReader/HTKMLFReader/utterancesourcemulti.h index 1e97242a7..510e7bc32 100644 --- a/DataReader/HTKMLFReader/utterancesourcemulti.h +++ b/DataReader/HTKMLFReader/utterancesourcemulti.h @@ -102,7 +102,7 @@ class minibatchutterancesourcemulti : public minibatchsource bool isinram() const { return !frames.empty(); } // page in data for this chunk // We pass in the feature info variables by ref which will be filled lazily upon first read - void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const + void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const { if (numutterances() == 0) throw std::logic_error ("requiredata: cannot page in virgin block"); @@ -132,6 +132,7 @@ class minibatchutterancesourcemulti : public minibatchsource latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols()); } //fprintf (stderr, "\n"); + if (verbosity) fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size()); } catch (...) 
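The hunks above thread verbosity through to requiredata(), whose network reads are wrapped in msra::util::attempt (5, ...). The helper's real implementation is not part of this diff; the following is a minimal sketch of such a retry wrapper under the obvious interpretation (run the lambda up to 'retries' times, rethrow the last failure):

    #include <cstdio>
    #include <exception>

    // retry 'body' up to 'retries' times; log and retry on failure, rethrow the
    // last exception when all attempts are exhausted (sketch, not CNTK's code)
    template <typename FUNCTION>
    static void attempt (int retries, const FUNCTION & body)
    {
        for (int i = 1; ; i++)
        {
            try
            {
                body();
                return;                 // success
            }
            catch (const std::exception & e)
            {
                if (i >= retries)
                    throw;              // out of retries: propagate the error
                fprintf (stderr, "attempt: %s, retrying %d-th time out of %d...\n",
                         e.what(), i + 1, retries);
            }
        }
    }

A call site then reads exactly like the hunk above: attempt (5, [&]() { chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity); });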
@@ -568,6 +569,7 @@ private: return sweep; currentsweep = sweep; + if (verbosity>0) fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance"); const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep @@ -919,10 +921,11 @@ private: { auto & chunk = randomizedchunks[m][chunkindex]; auto & chunkdata = chunk.getchunkdata(); + if (verbosity) fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1); msra::util::attempt (5, [&]() // (reading from network) { - chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices); + chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity); }); } chunksinram++; @@ -1029,7 +1032,8 @@ public: } } // return these utterances - fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep); + if (verbosity > 0) + fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep); size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch for (size_t pos = spos; pos < epos; pos++) { @@ -1107,6 +1111,7 @@ public: const size_t lastchunk = chunkforframepos (globalte-1); const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin; const size_t windowend = randomizedchunks[0][lastchunk].windowend; + if (verbosity) fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n", globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend); // release all data outside, and page in all data inside @@ -1230,3 +1235,4 @@ public: }; };}; + diff --git a/DataReader/SequenceReader/SequenceParser.h b/DataReader/SequenceReader/SequenceParser.h index 1226aaaf5..9115b2a00 100644 --- a/DataReader/SequenceReader/SequenceParser.h +++ b/DataReader/SequenceReader/SequenceParser.h @@ -1,616 +1,616 @@ -// SequenceParser.h : Parses the UCI format using a custom state machine (for speed) -// -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// - -#include -#include -#include -#include -#include -#include - -using namespace std; - -#define MAXSTRING 2048 -// UCI label location types -enum LabelMode -{ - LabelNone = 0, - LabelFirst = 1, - LabelLast = 2, -}; - -enum ParseMode -{ - ParseNormal = 0, - ParseLineCount = 1 -}; - -enum SequenceFlags -{ - seqFlagNull = 0, - seqFlagLineBreak = 1, // line break on the parsed line - seqFlagEmptyLine = 2, // empty line - seqFlagStartLabel = 4, - seqFlagStopLabel = 8 -}; - -// SequencePosition, save the ending indexes into the array for a sequence -struct SequencePosition -{ - size_t numberPos; // max position in the number array for this sequence - size_t labelPos; // max position in the label array for this sequence - unsigned flags; // flags that apply to this sequence - SequencePosition(size_t numPos, size_t labelPos, unsigned flags): - numberPos(numPos), labelPos(labelPos), flags(flags) - {} -}; - -// SequenceParser - the parser for the UCI format files -// for ultimate speed, this class implements a state machine to read these format files -template -class SequenceParser -{ -protected: - enum ParseState - { - WholeNumber = 0, - Remainder = 1, - Exponent = 2, - Whitespace = 3, - Sign = 4, - ExponentSign = 5, - Period = 6, - TheLetterE = 7, - EndOfLine = 8, - Label = 9, // any non-number things we run into - ParseStateMax = 10, // number of parse states - LineCountEOL = 10, - LineCountOther = 11, - AllStateMax = 12 - }; - - // type of label processing - ParseMode m_parseMode; - - // definition of label and feature dimensions - size_t m_dimFeatures; - - size_t m_dimLabelsIn; - std::string m_beginSequenceIn; // starting sequence string (i.e. ) - std::string m_endSequenceIn; // ending sequence string (i.e. ) - - size_t m_dimLabelsOut; - std::string m_beginSequenceOut; // starting sequence string (i.e. 'O') - std::string m_endSequenceOut; // ending sequence string (i.e. 'O') - - // level of screen output - int m_traceLevel; - - // current state of the state machine - ParseState m_current_state; - - // state tables - DWORD *m_stateTable; - - // numeric state machine variables - double m_partialResult; - double m_builtUpNumber; - double m_divider; - double m_wholeNumberMultiplier; - double m_exponentMultiplier; - - // label state machine variables - size_t m_spaceDelimitedStart; - size_t m_spaceDelimitedMax; // start of the next whitespace sequence (one past the end of the last word) - int m_numbersConvertedThisLine; - int m_labelsConvertedThisLine; - int m_elementsConvertedThisLine; - - // sequence state machine variables - bool m_beginSequence; - bool m_endSequence; - std::string m_beginTag; - std::string m_endTag; - - // global stats - int m_totalNumbersConverted; - int m_totalLabelsConverted; - - // file positions/buffer - FILE * m_pFile; - int64_t m_byteCounter; - int64_t m_fileSize; - - BYTE * m_fileBuffer; - size_t m_bufferStart; - size_t m_bufferSize; - - // last label was a string (for last label processing) - bool m_lastLabelIsString; - - // vectors to append to - std::vector* m_numbers; // pointer to vectors to append with numbers - std::vector* m_labels; // pointer to vector to append with labels (may be numeric) - // FUTURE: do we want a vector to collect string labels in the non string label case? 
(signifies an error) - - // SetState for a particular value - void SetState(int value, ParseState m_current_state, ParseState next_state); - - // SetStateRange - set states transitions for a range of values - void SetStateRange(int value1, int value2, ParseState m_current_state, ParseState next_state); - - // SetupStateTables - setup state transition tables for each state - // each state has a block of 256 states indexed by the incoming character - void SetupStateTables(); - - // reset all line state variables - void PrepareStartLine(); - - // reset all number accumulation variables - void PrepareStartNumber(); - - // reset all state variables to start reading at a new position - void PrepareStartPosition(size_t position); - - // UpdateBuffer - load the next buffer full of data - // returns - number of records read - size_t UpdateBuffer(); - -public: - - // SequenceParser constructor - SequenceParser(); - // setup all the state variables and state tables for state machine - void Init(); - - // Parser destructor - ~SequenceParser(); - -private: - // DoneWithLabel - Called when a string label is found - void DoneWithLabel(); - - // Called when a number is complete - void DoneWithValue(); - - // store label is specialized by LabelType - void StoreLabel(NumType value); - - // StoreLastLabel - store the last label (for numeric types), tranfers to label vector - // string label types handled in specialization - void StoreLastLabel(); - -public: - // SetParseMode - Set the parsing mode - // mode - set mode to either ParseLineCount, or ParseNormal - void SetParseMode(ParseMode mode); - - // SetTraceLevel - Set the level of screen output - // traceLevel - traceLevel, zero means no output, 1 epoch related output, > 1 all output - void SetTraceLevel(int traceLevel); - - - // ParseInit - Initialize a parse of a file - // fileName - path to the file to open - // dimFeatures - number of features for precomputed features - // dimLabelsIn - number of lables possible on input - // dimLabelsOut - number of labels possible on output - // beginSequenceIn - beginSequence input label - // endSequenceIn - endSequence input label - // beginSequenceOut - beginSequence output label - // endSequenceOut - endSequence output label - // bufferSize - size of temporary buffer to store reads - // startPosition - file position on which we should start - void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O", size_t bufferSize=1024*256, size_t startPosition=0) - { - assert(fileName != NULL); - m_dimFeatures = dimFeatures; - m_dimLabelsIn = dimLabelsIn; - m_beginSequenceIn = beginSequenceIn; - m_endSequenceIn = endSequenceIn; - m_dimLabelsOut = dimLabelsOut; - m_beginSequenceOut = beginSequenceOut; - m_endSequenceOut = endSequenceOut; - - m_parseMode = ParseNormal; - m_traceLevel = 0; - m_bufferSize = bufferSize; - m_bufferStart = startPosition; - - m_beginTag = m_beginSequenceIn; - m_endTag = m_endSequenceIn; - - // if we have a file already open, cleanup - if (m_pFile != NULL) - SequenceParser::~SequenceParser(); - - errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" ); - if (err) - RuntimeError("SequenceParser::ParseInit - error opening file"); - int rc = _fseeki64(m_pFile, 0, SEEK_END); - if (rc) - RuntimeError("SequenceParser::ParseInit - error seeking in file"); - - m_fileSize = GetFilePosition(); - m_fileBuffer = new BYTE[m_bufferSize]; - 
SetFilePosition(startPosition); - } - - // Parse - Parse the data - // recordsRequested - number of records requested - // labels - pointer to vector to return the labels - // numbers - pointer to vector to return the numbers - // seqPos - pointers to the other two arrays showing positions of each sequence - // returns - number of records actually read, if the end of file is reached the return value will be < requested records - long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) - { - assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); - assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); - - // transfer to member variables - m_numbers = numbers; - m_labels = labels; - - long TickStart = GetTickCount( ); - long recordCount = 0; - long lineCount = 0; - size_t bufferIndex = m_byteCounter-m_bufferStart; - SequencePosition sequencePositionLast(0,0,seqFlagNull); - while (m_byteCounter < m_fileSize && recordCount < recordsRequested) - { - // check to see if we need to update the buffer - if (bufferIndex >= m_bufferSize) - { - UpdateBuffer(); - bufferIndex = m_byteCounter-m_bufferStart; - } - - char ch = m_fileBuffer[bufferIndex]; - - ParseState nextState = (ParseState)m_stateTable[(m_current_state<<8)+ch]; - - if( nextState <= Exponent ) - { - m_builtUpNumber = m_builtUpNumber * 10 + (ch - '0'); - // if we are in the decimal portion of a number increase the divider - if (nextState == Remainder) - m_divider *= 10; - } - - // only do a test on a state transition - if (m_current_state != nextState) - { - // System.Diagnostics.Debug.WriteLine("Current state = " + m_current_state + ", next state = " + nextState); - - // if the nextState is a label, we don't want to do any number processing, it's a number prefixed string - if (nextState != Label) - { - // do the numeric processing - switch (m_current_state) - { - case TheLetterE: - if (m_divider != 0) // decimal number - m_partialResult += m_builtUpNumber / m_divider; - else // integer - m_partialResult = m_builtUpNumber; - m_builtUpNumber = 0; - break; - case WholeNumber: - // could be followed by a remainder, or an exponent - if (nextState != TheLetterE) - if( nextState != Period) - DoneWithValue(); - if (nextState == Period) - { - m_partialResult = m_builtUpNumber; - m_divider = 1; - m_builtUpNumber = 0; - } - break; - case Remainder: - // can only be followed by a exponent - if (nextState != TheLetterE) - DoneWithValue(); - break; - case Exponent: - DoneWithValue(); - break; - } - } - - // label handling - switch (m_current_state) - { - case Label: - DoneWithLabel(); - break; - case EndOfLine: - if (seqPos) - { - SequencePosition sequencePos(numbers->size(), labels->size(), - m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); - // add a sequence element to the list - seqPos->push_back(sequencePos); - sequencePositionLast = sequencePos; - } - - // end of sequence determines record separation - if (m_endSequence) - recordCount = (long)labels->size(); - - PrepareStartLine(); - break; - case Whitespace: - // this is the start of the next space delimited entity - if (nextState != EndOfLine) - m_spaceDelimitedStart = m_byteCounter; - break; - } - - // label handling for next state - switch (nextState) - { - // do sign processing on nextState, since we still have the character handy - case Sign: - if (ch == '-') - m_wholeNumberMultiplier = -1; - break; - case ExponentSign: - if (ch == '-') 
- m_exponentMultiplier = -1; - break; - // going into whitespace or endOfLine, so end of space delimited entity - case Whitespace: - m_spaceDelimitedMax = m_byteCounter; - // hit whitespace and nobody processed anything, so add as label - //if (m_elementsConvertedThisLine == elementsProcessed) - // DoneWithLabel(); - break; - case EndOfLine: - if (m_current_state != Whitespace) - { - m_spaceDelimitedMax = m_byteCounter; - // hit whitespace and nobody processed anything, so add as label - //if (m_elementsConvertedThisLine == elementsProcessed) - // DoneWithLabel(); - } - // process the label at the end of a line - //if (m_labelMode == LabelLast && m_labels != NULL) - //{ - // StoreLastLabel(); - //} - // intentional fall-through - case LineCountEOL: - lineCount++; // done with another record - if (m_traceLevel > 1) - { - // print progress dots - if (recordCount % 100 == 0) - { - if (recordCount % 1000 == 0) - { - if (recordCount % 10000 == 0) - { - fprintf(stderr, "#"); - } - else - { - fprintf(stderr, "+"); - } - } - else - { - fprintf(stderr, "."); - } - } - } - break; - case LineCountOther: - m_spaceDelimitedStart = m_byteCounter; - break; - } - } - - m_current_state = nextState; - - // move to next character - m_byteCounter++; - bufferIndex++; - } // while - - // at the end of the file we may need to add an additional sequencePosition push - // this could probably be fixed by taking another pass through the loop above, but this is easier - if (seqPos) - { - SequencePosition sequencePos(numbers->size(), labels->size(), - m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); - // add the final sequence element if needed - if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos)) - { - seqPos->push_back(sequencePos); - } - } - - long TickStop = GetTickCount( ); - - long TickDelta = TickStop - TickStart; - - if (m_traceLevel > 2) - fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted ); - return lineCount; - } - - - int64_t GetFilePosition(); - void SetFilePosition(int64_t position); - - // HasMoreData - test if the current dataset have more data - // returns - true if it does, false if not - bool HasMoreData(); -}; - -// StoreLabel - string version gets last space delimited string and stores in labels vector -template <> -void SequenceParser::StoreLabel(float finalResult); - -// DoneWithLabel - string version stores string label -template <> -void SequenceParser::DoneWithLabel(); - -// StoreLastLabel - string version -template <> -void SequenceParser::StoreLastLabel(); - -// NOTE: Current code is identical to float, don't know how to specialize with template parameter that only covers one parameter - -// StoreLabel - string version gets last space delimited string and stores in labels vector -template <> -void SequenceParser::StoreLabel(double finalResult); - -// DoneWithLabel - string version stores string label -template <> -void SequenceParser::DoneWithLabel(); - -// StoreLastLabel - string version -template <> -void SequenceParser::StoreLastLabel(); - -/// language model sequence parser -template -class LMSequenceParser : public SequenceParser -{ -protected: - FILE * mFile; - std::wstring mFileName; - -public: - LMSequenceParser() { - mFile = nullptr; - }; - ~LMSequenceParser() { - if (mFile) fclose(mFile); - } - - void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string 
endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O") - { - assert(fileName != NULL); - mFileName = fileName; - m_dimFeatures = dimFeatures; - m_dimLabelsIn = dimLabelsIn; - m_beginSequenceIn = beginSequenceIn; - m_endSequenceIn = endSequenceIn; - m_dimLabelsOut = dimLabelsOut; - m_beginSequenceOut = beginSequenceOut; - m_endSequenceOut = endSequenceOut; - - m_parseMode = ParseNormal; - m_traceLevel = 0; - m_bufferSize = 0; - m_bufferStart = 0; - - m_beginTag = m_beginSequenceIn; - m_endTag = m_endSequenceIn; - - m_fileSize = -1; - m_fileBuffer = NULL; - - if (mFile) fclose(mFile); - - if (_wfopen_s(&mFile, fileName, L"rt") != 0) - RuntimeError("cannot open file %s", fileName); - } - - void ParseReset() - { - if (mFile) fseek(mFile, 0, SEEK_SET); - } - - // Parse - Parse the data - // recordsRequested - number of records requested - // labels - pointer to vector to return the labels - // numbers - pointer to vector to return the numbers - // seqPos - pointers to the other two arrays showing positions of each sequence - // returns - number of records actually read, if the end of file is reached the return value will be < requested records - long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) - { - assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); - assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); - - // transfer to member variables - m_numbers = numbers; - m_labels = labels; - - long TickStart = GetTickCount( ); - long recordCount = 0; - long orgRecordCount = (long)labels->size(); - long lineCount = 0; - SequencePosition sequencePositionLast(0,0,seqFlagNull); - /// get line - char ch2[MAXSTRING]; - while (recordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr) - { - - string ch = ch2; - std::vector vstr; - vstr = sep_string(ch, " "); - if (vstr.size() < 3) - continue; - - for (size_t i = 0; i < vstr.size(); i++) - { - labels->push_back(vstr[i]); - } - SequencePosition sequencePos(numbers->size(), labels->size(), - m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); - // add a sequence element to the list - seqPos->push_back(sequencePos); - sequencePositionLast = sequencePos; - - recordCount = (long)labels->size() - orgRecordCount; - - lineCount ++; - } // while - - long TickStop = GetTickCount( ); - - long TickDelta = TickStop - TickStart; - - if (m_traceLevel > 2) - fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted ); - return lineCount; - } - -}; - -typedef struct{ - size_t sLen; - size_t sBegin; - size_t sEnd; -} stSentenceInfo; -/// language model sequence parser -template -class LMBatchSequenceParser: public LMSequenceParser -{ -public: - vector mSentenceIndex2SentenceInfo; - -public: - LMBatchSequenceParser() { }; - ~LMBatchSequenceParser() { } - - void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O"); - - // Parse - Parse the data - // recordsRequested - number of records requested - // labels - pointer to vector to return the labels - // numbers - pointer to vector to return the numbers - // seqPos - pointers to the other two arrays showing positions of each sequence - // returns - number of records actually read, if the end of file is reached the return value will be 
< requested records - long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos); - -}; +// SequenceParser.h : Parses the UCI format using a custom state machine (for speed) +// +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define MAXSTRING 500000 +// UCI label location types +enum LabelMode +{ + LabelNone = 0, + LabelFirst = 1, + LabelLast = 2, +}; + +enum ParseMode +{ + ParseNormal = 0, + ParseLineCount = 1 +}; + +enum SequenceFlags +{ + seqFlagNull = 0, + seqFlagLineBreak = 1, // line break on the parsed line + seqFlagEmptyLine = 2, // empty line + seqFlagStartLabel = 4, + seqFlagStopLabel = 8 +}; + +// SequencePosition, save the ending indexes into the array for a sequence +struct SequencePosition +{ + size_t numberPos; // max position in the number array for this sequence + size_t labelPos; // max position in the label array for this sequence + unsigned flags; // flags that apply to this sequence + SequencePosition(size_t numPos, size_t labelPos, unsigned flags): + numberPos(numPos), labelPos(labelPos), flags(flags) + {} +}; + +// SequenceParser - the parser for the UCI format files +// for ultimate speed, this class implements a state machine to read these format files +template +class SequenceParser +{ +protected: + enum ParseState + { + WholeNumber = 0, + Remainder = 1, + Exponent = 2, + Whitespace = 3, + Sign = 4, + ExponentSign = 5, + Period = 6, + TheLetterE = 7, + EndOfLine = 8, + Label = 9, // any non-number things we run into + ParseStateMax = 10, // number of parse states + LineCountEOL = 10, + LineCountOther = 11, + AllStateMax = 12 + }; + + // type of label processing + ParseMode m_parseMode; + + // definition of label and feature dimensions + size_t m_dimFeatures; + + size_t m_dimLabelsIn; + std::string m_beginSequenceIn; // starting sequence string (i.e. ) + std::string m_endSequenceIn; // ending sequence string (i.e. ) + + size_t m_dimLabelsOut; + std::string m_beginSequenceOut; // starting sequence string (i.e. 'O') + std::string m_endSequenceOut; // ending sequence string (i.e. 
'O') + + // level of screen output + int m_traceLevel; + + // current state of the state machine + ParseState m_current_state; + + // state tables + DWORD *m_stateTable; + + // numeric state machine variables + double m_partialResult; + double m_builtUpNumber; + double m_divider; + double m_wholeNumberMultiplier; + double m_exponentMultiplier; + + // label state machine variables + size_t m_spaceDelimitedStart; + size_t m_spaceDelimitedMax; // start of the next whitespace sequence (one past the end of the last word) + int m_numbersConvertedThisLine; + int m_labelsConvertedThisLine; + int m_elementsConvertedThisLine; + + // sequence state machine variables + bool m_beginSequence; + bool m_endSequence; + std::string m_beginTag; + std::string m_endTag; + + // global stats + int m_totalNumbersConverted; + int m_totalLabelsConverted; + + // file positions/buffer + FILE * m_pFile; + int64_t m_byteCounter; + int64_t m_fileSize; + + BYTE * m_fileBuffer; + size_t m_bufferStart; + size_t m_bufferSize; + + // last label was a string (for last label processing) + bool m_lastLabelIsString; + + // vectors to append to + std::vector* m_numbers; // pointer to vectors to append with numbers + std::vector* m_labels; // pointer to vector to append with labels (may be numeric) + // FUTURE: do we want a vector to collect string labels in the non string label case? (signifies an error) + + // SetState for a particular value + void SetState(int value, ParseState m_current_state, ParseState next_state); + + // SetStateRange - set states transitions for a range of values + void SetStateRange(int value1, int value2, ParseState m_current_state, ParseState next_state); + + // SetupStateTables - setup state transition tables for each state + // each state has a block of 256 states indexed by the incoming character + void SetupStateTables(); + + // reset all line state variables + void PrepareStartLine(); + + // reset all number accumulation variables + void PrepareStartNumber(); + + // reset all state variables to start reading at a new position + void PrepareStartPosition(size_t position); + + // UpdateBuffer - load the next buffer full of data + // returns - number of records read + size_t UpdateBuffer(); + +public: + + // SequenceParser constructor + SequenceParser(); + // setup all the state variables and state tables for state machine + void Init(); + + // Parser destructor + ~SequenceParser(); + +private: + // DoneWithLabel - Called when a string label is found + void DoneWithLabel(); + + // Called when a number is complete + void DoneWithValue(); + + // store label is specialized by LabelType + void StoreLabel(NumType value); + + // StoreLastLabel - store the last label (for numeric types), tranfers to label vector + // string label types handled in specialization + void StoreLastLabel(); + +public: + // SetParseMode - Set the parsing mode + // mode - set mode to either ParseLineCount, or ParseNormal + void SetParseMode(ParseMode mode); + + // SetTraceLevel - Set the level of screen output + // traceLevel - traceLevel, zero means no output, 1 epoch related output, > 1 all output + void SetTraceLevel(int traceLevel); + + + // ParseInit - Initialize a parse of a file + // fileName - path to the file to open + // dimFeatures - number of features for precomputed features + // dimLabelsIn - number of lables possible on input + // dimLabelsOut - number of labels possible on output + // beginSequenceIn - beginSequence input label + // endSequenceIn - endSequence input label + // beginSequenceOut - beginSequence output 
label + // endSequenceOut - endSequence output label + // bufferSize - size of temporary buffer to store reads + // startPosition - file position on which we should start + void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O", size_t bufferSize=1024*256, size_t startPosition=0) + { + assert(fileName != NULL); + m_dimFeatures = dimFeatures; + m_dimLabelsIn = dimLabelsIn; + m_beginSequenceIn = beginSequenceIn; + m_endSequenceIn = endSequenceIn; + m_dimLabelsOut = dimLabelsOut; + m_beginSequenceOut = beginSequenceOut; + m_endSequenceOut = endSequenceOut; + + m_parseMode = ParseNormal; + m_traceLevel = 0; + m_bufferSize = bufferSize; + m_bufferStart = startPosition; + + m_beginTag = m_beginSequenceIn; + m_endTag = m_endSequenceIn; + + // if we have a file already open, cleanup + if (m_pFile != NULL) + SequenceParser::~SequenceParser(); + + errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" ); + if (err) + RuntimeError("SequenceParser::ParseInit - error opening file"); + int rc = _fseeki64(m_pFile, 0, SEEK_END); + if (rc) + RuntimeError("SequenceParser::ParseInit - error seeking in file"); + + m_fileSize = GetFilePosition(); + m_fileBuffer = new BYTE[m_bufferSize]; + SetFilePosition(startPosition); + } + + // Parse - Parse the data + // recordsRequested - number of records requested + // labels - pointer to vector to return the labels + // numbers - pointer to vector to return the numbers + // seqPos - pointers to the other two arrays showing positions of each sequence + // returns - number of records actually read, if the end of file is reached the return value will be < requested records + long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) + { + assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); + assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); + + // transfer to member variables + m_numbers = numbers; + m_labels = labels; + + long TickStart = GetTickCount( ); + long recordCount = 0; + long lineCount = 0; + size_t bufferIndex = m_byteCounter-m_bufferStart; + SequencePosition sequencePositionLast(0,0,seqFlagNull); + while (m_byteCounter < m_fileSize && recordCount < recordsRequested) + { + // check to see if we need to update the buffer + if (bufferIndex >= m_bufferSize) + { + UpdateBuffer(); + bufferIndex = m_byteCounter-m_bufferStart; + } + + char ch = m_fileBuffer[bufferIndex]; + + ParseState nextState = (ParseState)m_stateTable[(m_current_state<<8)+ch]; + + if( nextState <= Exponent ) + { + m_builtUpNumber = m_builtUpNumber * 10 + (ch - '0'); + // if we are in the decimal portion of a number increase the divider + if (nextState == Remainder) + m_divider *= 10; + } + + // only do a test on a state transition + if (m_current_state != nextState) + { + // System.Diagnostics.Debug.WriteLine("Current state = " + m_current_state + ", next state = " + nextState); + + // if the nextState is a label, we don't want to do any number processing, it's a number prefixed string + if (nextState != Label) + { + // do the numeric processing + switch (m_current_state) + { + case TheLetterE: + if (m_divider != 0) // decimal number + m_partialResult += m_builtUpNumber / m_divider; + else // integer + m_partialResult = m_builtUpNumber; + m_builtUpNumber = 0; + break; + case WholeNumber: + // could be followed by a 
remainder, or an exponent + if (nextState != TheLetterE) + if( nextState != Period) + DoneWithValue(); + if (nextState == Period) + { + m_partialResult = m_builtUpNumber; + m_divider = 1; + m_builtUpNumber = 0; + } + break; + case Remainder: + // can only be followed by a exponent + if (nextState != TheLetterE) + DoneWithValue(); + break; + case Exponent: + DoneWithValue(); + break; + } + } + + // label handling + switch (m_current_state) + { + case Label: + DoneWithLabel(); + break; + case EndOfLine: + if (seqPos) + { + SequencePosition sequencePos(numbers->size(), labels->size(), + m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); + // add a sequence element to the list + seqPos->push_back(sequencePos); + sequencePositionLast = sequencePos; + } + + // end of sequence determines record separation + if (m_endSequence) + recordCount = (long)labels->size(); + + PrepareStartLine(); + break; + case Whitespace: + // this is the start of the next space delimited entity + if (nextState != EndOfLine) + m_spaceDelimitedStart = m_byteCounter; + break; + } + + // label handling for next state + switch (nextState) + { + // do sign processing on nextState, since we still have the character handy + case Sign: + if (ch == '-') + m_wholeNumberMultiplier = -1; + break; + case ExponentSign: + if (ch == '-') + m_exponentMultiplier = -1; + break; + // going into whitespace or endOfLine, so end of space delimited entity + case Whitespace: + m_spaceDelimitedMax = m_byteCounter; + // hit whitespace and nobody processed anything, so add as label + //if (m_elementsConvertedThisLine == elementsProcessed) + // DoneWithLabel(); + break; + case EndOfLine: + if (m_current_state != Whitespace) + { + m_spaceDelimitedMax = m_byteCounter; + // hit whitespace and nobody processed anything, so add as label + //if (m_elementsConvertedThisLine == elementsProcessed) + // DoneWithLabel(); + } + // process the label at the end of a line + //if (m_labelMode == LabelLast && m_labels != NULL) + //{ + // StoreLastLabel(); + //} + // intentional fall-through + case LineCountEOL: + lineCount++; // done with another record + if (m_traceLevel > 1) + { + // print progress dots + if (recordCount % 100 == 0) + { + if (recordCount % 1000 == 0) + { + if (recordCount % 10000 == 0) + { + fprintf(stderr, "#"); + } + else + { + fprintf(stderr, "+"); + } + } + else + { + fprintf(stderr, "."); + } + } + } + break; + case LineCountOther: + m_spaceDelimitedStart = m_byteCounter; + break; + } + } + + m_current_state = nextState; + + // move to next character + m_byteCounter++; + bufferIndex++; + } // while + + // at the end of the file we may need to add an additional sequencePosition push + // this could probably be fixed by taking another pass through the loop above, but this is easier + if (seqPos) + { + SequencePosition sequencePos(numbers->size(), labels->size(), + m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); + // add the final sequence element if needed + if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos)) + { + seqPos->push_back(sequencePos); + } + } + + long TickStop = GetTickCount( ); + + long TickDelta = TickStop - TickStart; + + if (m_traceLevel > 2) + fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted ); + return lineCount; + } + + + int64_t GetFilePosition(); + void SetFilePosition(int64_t position); + + // HasMoreData - test if the current dataset 
have more data + // returns - true if it does, false if not + bool HasMoreData(); +}; + +// StoreLabel - string version gets last space delimited string and stores in labels vector +template <> +void SequenceParser::StoreLabel(float finalResult); + +// DoneWithLabel - string version stores string label +template <> +void SequenceParser::DoneWithLabel(); + +// StoreLastLabel - string version +template <> +void SequenceParser::StoreLastLabel(); + +// NOTE: Current code is identical to float, don't know how to specialize with template parameter that only covers one parameter + +// StoreLabel - string version gets last space delimited string and stores in labels vector +template <> +void SequenceParser::StoreLabel(double finalResult); + +// DoneWithLabel - string version stores string label +template <> +void SequenceParser::DoneWithLabel(); + +// StoreLastLabel - string version +template <> +void SequenceParser::StoreLastLabel(); + +/// language model sequence parser +template +class LMSequenceParser : public SequenceParser +{ +protected: + FILE * mFile; + std::wstring mFileName; + +public: + LMSequenceParser() { + mFile = nullptr; + }; + ~LMSequenceParser() { + if (mFile) fclose(mFile); + } + + void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O") + { + assert(fileName != NULL); + mFileName = fileName; + m_dimFeatures = dimFeatures; + m_dimLabelsIn = dimLabelsIn; + m_beginSequenceIn = beginSequenceIn; + m_endSequenceIn = endSequenceIn; + m_dimLabelsOut = dimLabelsOut; + m_beginSequenceOut = beginSequenceOut; + m_endSequenceOut = endSequenceOut; + + m_parseMode = ParseNormal; + m_traceLevel = 0; + m_bufferSize = 0; + m_bufferStart = 0; + + m_beginTag = m_beginSequenceIn; + m_endTag = m_endSequenceIn; + + m_fileSize = -1; + m_fileBuffer = NULL; + + if (mFile) fclose(mFile); + + if (_wfopen_s(&mFile, fileName, L"rt") != 0) + RuntimeError("cannot open file %s", fileName); + } + + void ParseReset() + { + if (mFile) fseek(mFile, 0, SEEK_SET); + } + + // Parse - Parse the data + // recordsRequested - number of records requested + // labels - pointer to vector to return the labels + // numbers - pointer to vector to return the numbers + // seqPos - pointers to the other two arrays showing positions of each sequence + // returns - number of records actually read, if the end of file is reached the return value will be < requested records + long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) + { + assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); + assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); + + // transfer to member variables + m_numbers = numbers; + m_labels = labels; + + long TickStart = GetTickCount( ); + long recordCount = 0; + long orgRecordCount = (long)labels->size(); + long lineCount = 0; + SequencePosition sequencePositionLast(0,0,seqFlagNull); + /// get line + char ch2[MAXSTRING]; + while (recordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr) + { + + string ch = ch2; + std::vector vstr; + vstr = sep_string(ch, " "); + if (vstr.size() < 3) + continue; + + for (size_t i = 0; i < vstr.size(); i++) + { + labels->push_back(vstr[i]); + } + SequencePosition sequencePos(numbers->size(), labels->size(), + m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | 
+
+typedef struct {
+    size_t sLen;    // sentence length in tokens
+    size_t sBegin;  // index of the sentence's first token in the label array
+    size_t sEnd;    // index one past the sentence's last token
+} stSentenceInfo;
+
+/// language model sequence parser that also records per-sentence positions for batching
+template <typename NumType, typename LabelType>
+class LMBatchSequenceParser : public LMSequenceParser<NumType, LabelType>
+{
+public:
+    // filled in by Parse(): one entry per input sentence, in file order
+    vector<stSentenceInfo> mSentenceIndex2SentenceInfo;
+
+public:
+    LMBatchSequenceParser() { }
+    ~LMBatchSequenceParser() { }
+
+    void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="</s>", std::string endSequenceIn="</s>", std::string beginSequenceOut="O", std::string endSequenceOut="O");
+
+    // Parse - Parse the data
+    // recordsRequested - number of records requested
+    // labels - pointer to vector to return the labels
+    // numbers - pointer to vector to return the numbers
+    // seqPos - pointers to the other two arrays showing positions of each sequence
+    // returns - number of records actually read; if the end of file is reached the return value will be < requested records
+    long Parse(size_t recordsRequested, std::vector<LabelType> *labels, std::vector<NumType> *numbers, std::vector<SequencePosition> *seqPos);
+};
diff --git a/DataReader/SequenceReader/SequenceReader.cpp b/DataReader/SequenceReader/SequenceReader.cpp
index a95d9ee6d..cec66fd52 100644
--- a/DataReader/SequenceReader/SequenceReader.cpp
+++ b/DataReader/SequenceReader/SequenceReader.cpp
@@ -1,2010 +1,2007 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// SequenceReader.cpp : Defines the exported functions for the DLL application.
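-//
-// (The readers below drive the parsers declared in SequenceParser.h: the
-// minibatch logic in BatchSequenceReader relies on
-// LMBatchSequenceParser::mSentenceIndex2SentenceInfo to group sentences of
-// equal length, up to nbruttsineachrecurrentiter of them, into one minibatch.)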
-// - - -#include "stdafx.h" -#define DATAREADER_EXPORTS // creating the exports here -#include "DataReader.h" -#include "SequenceReader.h" -#ifdef LEAKDETECT -#include // leak detection -#endif -#include "fileutil.h" // for fexists() - -namespace Microsoft { namespace MSR { namespace CNTK { - -// ReadLine - Read a line -// readSample - sample to read in global sample space -// returns - true if we successfully read a record, otherwise false -template -bool SequenceReader::ReadRecord(size_t /*readSample*/) -{ - return false; // not used -} - -// RecordsToRead - Determine number of records to read to populate record buffers -// mbStartSample - the starting sample from which to read -// tail - we are checking for possible remainer records to read (default false) -// returns - true if we have more to read, false if we hit the end of the dataset -template -size_t SequenceReader::RecordsToRead(size_t mbStartSample, bool tail) -{ - assert(mbStartSample >= m_epochStartSample); - // determine how far ahead we need to read - // need to read to the end of the next minibatch - size_t epochSample = mbStartSample; - epochSample %= m_epochSize; - - // determine number left to read for this epoch - size_t numberToEpoch = m_epochSize - epochSample; - // we will take either a minibatch or the number left in the epoch - size_t numberToRead = min(numberToEpoch, m_mbSize); - if (numberToRead == 0 && !tail) - numberToRead = m_mbSize; - - return numberToRead; -} - -// GetIdFromLabel - get an Id from a Label -// mbStartSample - the starting sample we are ensureing are good -// endOfDataCheck - check if we are at the end of the dataset (no wraparound) -// returns - true if we have more to read, false if we hit the end of the dataset -template -/*IDataReader::LabelIdType*/ unsigned SequenceReader::GetIdFromLabel(const std::string& labelValue, LabelInfo& labelInfo) -{ - auto found = labelInfo.mapLabelToId.find(labelValue); - - // not yet found, add to the map - if (found == labelInfo.mapLabelToId.end()) - { - labelInfo.mapLabelToId[labelValue] = labelInfo.idMax; - labelInfo.mapIdToLabel[labelInfo.idMax] = labelValue; - found = labelInfo.mapLabelToId.find(labelValue); - labelInfo.idMax++; - } - return found->second; -} - -template -/*IDataReader::LabelIdType*/ bool SequenceReader::CheckIdFromLabel(const std::string& labelValue, const LabelInfo& labelInfo, unsigned & labelId) -{ - auto found = labelInfo.mapLabelToId.find(labelValue); - - // not yet found, add to the map - if (found == labelInfo.mapLabelToId.end()) - { - return false; - } - labelId = found->second; - return true; -} - -// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested -// mbStartSample - the starting sample we are starting with -// endOfDataCheck - check if we are at the end of the dataset (no wraparound) -// returns - true if we have more to read, false if we hit the end of the dataset -template -bool SequenceReader::EnsureDataAvailable(size_t mbStartSample, bool /*endOfDataCheck*/) -{ - assert(mbStartSample >= m_epochStartSample); - // determine how far ahead we need to read - // need to read to the end of the next minibatch - size_t epochSample = mbStartSample; - bool moreToRead = true; - - size_t numberToRead = RecordsToRead(mbStartSample); - - // check to see if we have the proper records read already - if (m_readNextSample >= mbStartSample+numberToRead && mbStartSample >= m_epochStartSample) - return true; - - // if we have another sequence already read and waiting, just return now - if (m_seqIndex < 
m_sequence.size()) - return true; - - m_seqIndex = 0; - m_mbStartSample = 0; - m_sequence.clear(); - m_featureData.clear(); - m_labelIdData.clear(); - - m_readNextSample = 0; - epochSample = 0; - - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - - bool nextWord = false; - if (m_labelInfo[labelInfoOut].type == labelNextWord) - { - nextWord = true; - } - LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; - - //if (m_labelIdData.size() > epochSample) - //{ - // m_labelIdData.resize(epochSample); - // m_labelData.resize(epochSample*labelInfo.dim); - //} - - // see how many we already read - int sequencesRead = 0; - std::vector featureTemp; - std::vector labelTemp; - std::vector seqPos; - do - { - int numRead = m_parser.Parse(CACHE_BLOG_SIZE, &labelTemp, &featureTemp, &seqPos); - moreToRead = (numRead != 0); - - // translate from the sparse parsed data format to the to the training format data - int label = 0; - bool bSentenceStart = false; - SequencePosition sposLast = SequencePosition(0,0,seqFlagNull); - for (int seq = 0; seq < numRead; seq++) - { - // check - SequencePosition spos = seqPos[seq]; - if (spos.labelPos == sposLast.labelPos && spos.numberPos == sposLast.numberPos) - continue; - sposLast = spos; - - bSentenceStart = true; - - // loop through the labels for this entry - while (label < spos.labelPos) /// need to minus one since - { - - // labelIn should be a category label - LabelType labelValue = labelTemp[label++]; - - if (trim(labelValue).size() == 0) - continue; // empty input - - // check for end of sequence marker - if (!bSentenceStart && (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str()) || ((label - 1 )% m_mbSize == 0) )) - { - // ignore those cases where $ is put in the begining, because those are used for initialization purpose - spos.flags |= seqFlagStopLabel; - sequencesRead++; - - // create the seqence table - m_sequence.push_back(epochSample); - if ((m_sequence.size() == 1 ? epochSample : epochSample - m_sequence[m_sequence.size()-2]) > m_mbSize) - { - fprintf(stderr, "read sentence length is longer than the minibatch size. should be smaller. increase the minibatch size to at least %d", epochSample); - RuntimeError("read sentence length is longer than the minibatch size. should be smaller. increase the minibatch size to at least %d", epochSample); - } - - if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) - continue; /// ignore sentence ending - } - - // to-do, should ignore , check the sentence ending is - // need to remove from the training set - // allocate and initialize the next chunck of featureData - if (labelIn.type == labelCategory) - { - LabelIdType index = GetIdFromLabel(labelValue, labelIn); - - // use the found value, and set the appropriate location to a 1.0 - assert(labelIn.dim > index); // if this goes off labelOut dimension is too small - m_featureData.push_back((float)index); - } - else - { - RuntimeError("Input label expected to be a category label"); - } - - // if we have potential features - if (m_featureDim > 0) - { - RuntimeError("to-do. Assume sparse input feature. need to change the code from dense matrix"); - // move the position up to the start of the additional features section -/* pos += labelIn.dim; - assert(pos + m_featureDim == m_featureData.size()); - // this has to be an even number, a pair of index and value - if ((spos.numberPos&1) != 0) - RuntimeError("Features must be specified in pairs (index:value). 
Invalid features for label '%s'\n", labelValue); - - while (feature < spos.numberPos) - { - int index = (int)featureTemp[feature++]; - if (index < 0 || index >= m_featureDim) - RuntimeError("Invalid feature index: %d for label '%s', feature max dimension = %lld\n", index, labelValue, m_featureDim); - - ElemType value = featureTemp[feature++]; - m_featureData[pos+index] = value; - } - */ - } - - // now get the output label - if (m_labelInfo[labelInfoOut].type == labelCategory) - { - labelValue = labelTemp[label++]; - } - else if (nextWord) - { - // this is the next word (label was incremented above) - labelValue = labelTemp[label]; - if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) - { - labelValue = labelInfo.endSequence; - } - } - else - { - RuntimeError("Invalid output label type, expected Category, or Next Word"); - } - - // get the ID from the label - LabelIdType id = GetIdFromLabel(labelValue, labelInfo); - m_labelIdData.push_back(id); - - m_readNextSample++; - epochSample++; - if (!m_endReached) - m_totalSamples++; // add to the total number of records in the dataset - - bSentenceStart = false; - } - - { - // check if the reading is right - int jEnd = (int) m_labelIdData.size() - 1; - LabelIdType index ; - if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) - RuntimeError("cannot find sentence begining label"); - - if (m_labelIdData[jEnd] != index ) - /// for language model, the first word/letter has to be - RuntimeError("SequenceReader: the last letter/word of a batch has to be the sentence ending symbol"); - } - - } - - m_readNextSampleLine += numRead; - } - while (sequencesRead < 1 && moreToRead); // we need to read at least one sequence or have no more data - - // if we read to the end, update appropriate variables - if (!moreToRead) - { - UpdateDataVariables(); - } - - // if there more to read - return moreToRead; -} - -// UpdateDataVariables - Update variables that depend on the dataset being completely read -template -void SequenceReader::UpdateDataVariables() -{ - // if we haven't been all the way through the file yet - if (!m_endReached) - { - // get the size of the dataset - assert(m_totalSamples*m_featureCount >= m_featureData.size()); - - // if they want us to determine epoch size based on dataset size, do that - if (m_epochSize == requestDataSize) - { - m_epochSize = m_totalSamples; - } - - WriteLabelFile(); - - // we got to the end of the dataset - m_endReached = true; - } - - // update the label dimension if it is not big enough, need it here because m_labelIdMax get's updated in the processing loop (after a read) - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - if (m_labelInfo[index].type == labelCategory && m_labelInfo[index].idMax > m_labelInfo[index].dim) - m_labelInfo[index].dim = m_labelInfo[index].idMax; // update the label dimensions if different - } -} - -template -void SequenceReader::WriteLabelFile() -{ - // update the label dimension if it is not big enough, need it here because m_labelIdMax get's updated in the processing loop (after a read) - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - LabelInfo& labelInfo = m_labelInfo[index]; - - // write out the label file if they don't have one - if (!labelInfo.fileToWrite.empty()) - { - if (labelInfo.mapIdToLabel.size() > 0) - { - File labelFile(labelInfo.fileToWrite, fileOptionsWrite | fileOptionsText); - for (int i=0; i < labelInfo.mapIdToLabel.size(); ++i) - { - labelFile << labelInfo.mapIdToLabel[i] << '\n'; - } - 
labelInfo.fileToWrite.clear(); - } - else if (!m_cachingWriter) - { - fprintf(stderr, "WARNING: file %ws NOT written to disk, label files only written when starting at epoch zero!", labelInfo.fileToWrite.c_str()); - } - } - } -} - -template -void SequenceReader::LoadLabelFile(const std::wstring &filePath, std::vector& retLabels) -{ - File file(filePath, fileOptionsRead); - - // initialize with file name - std::string path = msra::strfun::utf8(filePath); - auto location = path.find_last_of("/\\"); - if (location != npos) - path = path.substr(location+1); - - // read the entire file into a string - string str; - retLabels.resize(0); - while (!file.IsEOF()) - { - file.GetLine(str); - - // check for a comment line - string::size_type pos = str.find_first_not_of(" \t"); - if (pos != -1) - { - retLabels.push_back((LabelType)trim(str)); - } - } -} - - -// Destroy - cleanup and remove this class -// NOTE: this destroys the object, and it can't be used past this point -template -void SequenceReader::Destroy() -{ - delete this; -} - -// Init - Reader Initialize for multiple data sets -// config - [in] configuration parameters for the datareader -// Sample format below: -//# Parameter values for the reader -//reader=[ -// # reader to use -// readerType=SequenceReader -// randomize=None -// # additional features dimension -// featureDim=784 -// file=c:\data\sequence\sequence.txt -// labelIn=[ -// dim=26 -// labelMappingFile=c:\data\sequence\alphabet.txt -// labelType=Category -// beginSequence="" -// endSequence="" -// ] -// labelOut=[ -// dim=129 -// labelMappingFile=c:\data\sequence\phonemes.txt -// labelType=Category -// beginSequence="O" -// endSequence="O" -// ] -//] -template -void SequenceReader::Init(const ConfigParameters& readerConfig) -{ - // See if the user wants caching - m_cachingReader = NULL; - m_cachingWriter = NULL; - - // NOTE: probably want to re-enable at some point - - // initialize the cache - //InitCache(readerConfig); - //m_readerConfig = readerConfig; - - //// if we have a cache, no need to parse the test files... 
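-    // (ReadClassInfo, called from this Init when a "wordclass" file is
-    // configured, expects one line per word of the form
-    // "wordId<TAB>count<TAB>word<TAB>classId", matching its fscanf_s calls.)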
- //if (m_cachingReader) - // return; - - std::vector features; - std::vector labels; - GetFileConfigNames(readerConfig, features, labels); - if (features.size() > 0) - { - m_featuresName = features[0]; - } - - if (labels.size() == 2) - { - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelsName[index] = labels[index]; - } - } - else - RuntimeError("two label definitions (in and out) required for Sequence Reader"); - - ConfigParameters featureConfig = readerConfig(m_featuresName,""); - ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; - - class_size = 0; - m_featureDim = featureConfig("dim"); - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelInfo[index].idMax = 0; - m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); - m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); - - // determine label type desired - std::string labelType(labelConfig[index]("labelType","Category")); - if (labelType == "Category") - { - m_labelInfo[index].type = labelCategory; - } - else if (labelType == "NextWord") - { - // in this case, it's all identical to the Input labels, except the data type - m_labelInfo[index].type = labelNextWord; - m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; - } - else if (labelType == "None") - { - m_labelInfo[index].type = labelNone; - m_labelInfo[index].dim = 0; // override for no labels - } - - // if we have labels, we need a label Mapping file, it will be a file with one label per line - if (m_labelInfo[index].type != labelNone) - { - std::wstring wClassFile = readerConfig("wordclass", ""); - nwords = labelConfig[index]("labelDim"); - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - } - - std::vector arrayLabels; - std::wstring labelPath = labelConfig[index]("labelMappingFile"); - if (fexists(labelPath)) - { - LoadLabelFile(labelPath, arrayLabels); - for (int i=0; i < arrayLabels.size(); ++i) - { - LabelType label = arrayLabels[i]; - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); - m_labelInfo[index].mapName = labelPath; - } - else - { - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - int iMax = -1, i; - for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) - { - LabelType label = ptr->first; - i = ptr->second; - iMax = max(i, iMax); - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)(iMax+1); - - OrganizeClass(); - - } - m_labelInfo[index].mapName = labelPath; - - m_labelInfo[index].fileToWrite = labelPath; - } - } - - m_labelInfo[index].dim = labelConfig[index]("labelDim"); - - // update dimension if the file says it's bigger - if (m_labelInfo[index].dim < m_labelInfo[index].idMax) - { - m_labelInfo[index].dim = m_labelInfo[index].idMax; - } - } - - // initialize all the variables - m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; - m_endReached = false; - m_readNextSampleLine = 0; - m_readNextSample = 0; - m_traceLevel = readerConfig("traceLevel","0"); - m_parser.SetTraceLevel(m_traceLevel); - - if (readerConfig.Exists("randomize")) - { - string randomizeString = readerConfig("randomize"); - if (randomizeString == "None") - { - ; - } - else if (randomizeString == "Auto") - { - ; - } - else - { - ;//readerConfig("randomize"); - } - } - else - { - ; 
//randomizeAuto; - } - - // The input data is a combination of the label Data and extra feature dims together -// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; - m_featureCount = 1; - - std::wstring m_file = readerConfig("file"); - if (m_traceLevel > 0) - fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); - - const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; - m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); -} - -template -void SequenceReader::ReadWord(char *word, FILE *fin) -{ - int a=0, ch; - - while (!feof(fin)) { - ch=fgetc(fin); - - if (ch==13) continue; - - if ((ch==' ') || (ch=='\t') || (ch=='\n')) { - if (a>0) { - if (ch=='\n') ungetc(ch, fin); - break; - } - - if (ch=='\n') { - strcpy_s(word, strlen(""), (char *)""); - return; - } - else continue; - } - - word[a]=(char)ch; - a++; - - if (a>=MAX_STRING) { - //printf("Too long word found!\n"); //truncate too long words - a--; - } - } - word[a]=0; -} - -template -void SequenceReader::ReadClassInfo(const wstring & vocfile, bool /*flatten*/) -{ - char strFileName[MAX_STRING]; - char stmp[MAX_STRING]; - string strtmp; - size_t sz; - int cnt, clsidx, b; - class_size = 0; - - wcstombs_s(&sz, strFileName, 2048, vocfile.c_str(), vocfile.length()); - - FILE * vin; - vin = fopen(strFileName, "rt") ; - - if (vin == nullptr) - { - RuntimeError("cannot open word class file"); - } - for (int a = 0; a < nwords; a++) - { - fscanf_s(vin, "%6d\t%10d\t", &b, &cnt); - ReadWord(stmp, vin); - fscanf_s(vin, "%d\t\n", &clsidx); - strtmp = stmp; - idx4cnt[b] = cnt; - word4idx[strtmp] = b; - idx4word[b]= strtmp; - - idx4class[b] = clsidx; - class_size = max(class_size, clsidx); - } - fclose(vin); - - class_size ++; -} - -// InitCache - Initialize the caching reader if cache files exist, otherwise the writer -// readerConfig - reader configuration -template -void SequenceReader::InitCache(const ConfigParameters& readerConfig) -{ - // check for a writer tag first (lets us know we are caching) - if (!readerConfig.Exists("writerType")) - return; - - // first try to open the binary cache - bool found = false; - try - { - // TODO: need to go down to all levels, maybe search for sectionType - ConfigArray filesList(','); - vector names; - if (readerConfig.Exists("wfile")) - { - filesList.push_back(readerConfig("wfile")); - if (fexists(readerConfig("wfile"))) - found = true; - } - FindConfigNames(readerConfig, "wfile", names); - for (auto name : names) - { - ConfigParameters config = readerConfig(name); - filesList.push_back(config("wfile")); - if (fexists(config("wfile"))) - found = true; - } - - // if we have a file already, we are going to read the cached files - if (found) - { - ConfigParameters config; - readerConfig.CopyTo(config); - // mmodify the config so the reader types look correct - config["readerType"] = config("writerType"); - config["file"] = filesList; - m_cachingReader = new DataReader(config); - } - else - { - m_cachingWriter = new DataWriter(readerConfig); - - // now get the section names for map and category types - std::map sections; - m_cachingWriter->GetSections(sections); - for (auto pair : sections) - { - // TODO: we would need to add a sequenceMap type here as well - // or maybe change to heirarchal name (i.e. 
root.labelIn.map) - if (pair.second == sectionTypeCategoryLabel) - { - m_labelsCategoryName[labelInfoOut] = pair.first; - } - else if (pair.second == sectionTypeLabelMapping) - { - m_labelsMapName[labelInfoOut] = pair.first; - } - } - } - } - catch (runtime_error err) - { - fprintf(stderr,"Error attemping to create Binary%s\n%s\n",found?"Reader":"Writer",err.what()); - delete m_cachingReader; - m_cachingReader = NULL; - delete m_cachingWriter; - m_cachingWriter = NULL; - } - catch (...) - { - // if there is any error, just get rid of the object - fprintf(stderr,"Error attemping to create Binary%s\n",found?"Reader":"Writer"); - delete m_cachingReader; - m_cachingReader = NULL; - delete m_cachingWriter; - m_cachingWriter = NULL; - } -} - -// destructor - virtual so it gets called properly -template -SequenceReader::~SequenceReader() -{ - ReleaseMemory(); - delete m_cachingReader; - delete m_cachingWriter; -} - -// ReleaseMemory - release the memory footprint of SequenceReader -// used when the caching reader is taking over -template -void SequenceReader::ReleaseMemory() -{ - if (m_featuresBuffer!=NULL) - delete[] m_featuresBuffer; - m_featuresBuffer=NULL; - if (m_labelsBuffer!=NULL) - delete[] m_labelsBuffer; - m_labelsBuffer=NULL; - if (m_labelsIdBuffer!=NULL) - delete[] m_labelsIdBuffer; - m_labelsIdBuffer=NULL; - m_featureData.clear(); - m_labelIdData.clear(); - m_labelData.clear(); - m_sequence.clear(); -} - -//SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch -template -void SequenceReader::SetupEpoch() -{ - // if we are starting fresh (epoch zero and no data read), init everything - // however if we are using cachingWriter, we need to know record count, so do that first - if (m_epoch == 0 && m_totalSamples == 0 && m_cachingWriter == NULL) - { - m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0; - m_parser.SetFilePosition(0); - } - else // otherwise, position the read to start at the right location - { - m_seqIndex = 0; - // don't know the total number of samples yet, so count them - if (m_totalSamples == 0) - { - if (m_traceLevel > 0) - fprintf(stderr, "starting at epoch %d parsing all data to determine record count\n", m_epoch); - // choose a large number to read - m_parser.SetFilePosition(0); - m_mbStartSample = 0; - while (EnsureDataAvailable(m_mbStartSample)) - { - m_mbStartSample = m_totalSamples; - m_seqIndex = m_sequence.size(); - } - if (m_traceLevel > 0) - fprintf(stderr, "\n %lld records found\n", m_totalSamples); - } - m_seqIndex = 0; - - // we have a slight delima here, if we haven't determined the end of the file yet - // and the user told us to find how many records are in the file, we can't distinguish "almost done" - // with a file (a character away) and the middle of the file. So read ahead a record to see if it's there. 
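-        // probing HasMoreData() below forces UpdateDataVariables() to run once
-        // the true end of file has been seen, so m_totalSamples and the label
-        // dimensions are final before we rewind to the first sample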
- bool endReached = m_endReached; - if (!endReached) - { - if (!m_parser.HasMoreData()) - { - endReached = true; - UpdateDataVariables(); - assert(m_endReached); - } - } - - // always start from the first sample - m_epochStartSample = m_mbStartSample = 0; - } -} - -template -void SequenceReader::LMSetupEpoch() -{ - m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0; -} - -// utility function to round an integer up to a multiple of size -size_t RoundUp(size_t value, size_t size) -{ - return ((value + size -1)/size)*size; -} - -//StartMinibatchLoop - Startup a minibatch loop -// mbSize - [in] size of the minibatch (number of Samples, etc.) -// NOTE: for sequence data, this will be the MAX size of a sequence, as every sequence could be a different length -// epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run) -// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset -template -void SequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) -{ - // if we aren't currently caching, see if we can use a cache - if (!m_cachingReader && !m_cachingWriter) - { - InitCache(m_readerConfig); - if (m_cachingReader) - ReleaseMemory(); // free the memory used by the SequenceReader - } - - // if we are reading from the cache, do so now and return - if (m_cachingReader) - { - m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); - return; - } - - if (m_featuresBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; - memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); - } - - if (m_labelsBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - if (labelInfo.type == labelCategory) - { - m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); - m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; - memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); - } - else if (labelInfo.type != labelNone) - { - m_labelsBuffer = new ElemType[mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); - m_labelsIdBuffer = NULL; - } - } - - m_mbSize = mbSize; - if (requestedEpochSamples == requestDataSize) - { - if (!m_endReached) - { - m_epochSize = requestDataSize; - } - } - else - { - m_epochSize = requestedEpochSamples; - } - - // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set - size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; - m_epoch = epoch; - m_mbStartSample = epoch*m_epochSize; - - // allocate room for the data - m_featureData.reserve(m_featureCount*epochSize); - if (m_labelInfo[labelInfoOut].type == labelCategory) - m_labelIdData.reserve(epochSize); - else if (m_labelInfo[labelInfoOut].type != labelNone) - m_labelData.reserve(epochSize); - m_sequence.reserve(m_seqIndex); // clear out the sequence array - /// this is too complicated for LM - // SetupEpoch(); - /// use the LMSetupEpoch() instead - LMSetupEpoch(); - - m_clsinfoRead = false; - m_idx2clsRead = false; - - m_parser.ParseReset(); -} - -template -bool SequenceReader::DataEnd(EndDataType endDataType) 
-{ - bool ret = false; - switch (endDataType) - { - case endDataNull: - assert(false); - break; - case endDataEpoch: - ret = m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]; - break; - case endDataSet: - ret = !EnsureDataAvailable(m_mbStartSample); - break; - case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true - ret = SentenceEnd(); - break; - } - return ret; -} - - -template -bool SequenceReader::SentenceEnd() -{ - // this is after getMinibatch size, which has increased m_seqIndex by 1 - // so the real index is m_seqIndex - 1; - int seqIndex = (int)m_seqIndex - 1; - - // now get the labels - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - size_t actualmbsize = 0; - - // figure out the size of the next sequence - if (seqIndex > 0) - { - actualmbsize = m_sequence[seqIndex] - m_sequence[seqIndex-1]; - } - else - { - actualmbsize = m_sequence[0]; - } - - if (actualmbsize < m_mbSize) - return true; - - size_t jEnd = m_sequence[seqIndex]-1; - - if (labelInfo.type == labelCategory) - { - LabelIdType index ; - if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) - RuntimeError("cannot find sentence begining label"); - - if (m_labelIdData[jEnd] == index ) - return true; - else - return false; - } - return false; -} - -template -void SequenceReader::GetLabelOutput(std::map*>& matrices, - size_t m_mbStartSample, size_t actualmbsize) -{ - size_t j = 0; - Matrix* labels = matrices[m_labelsName[labelInfoOut]]; - if (labels == nullptr) return; - - labels->Resize(nwords + class_size, actualmbsize, false); - - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - - int wrd = m_labelIdData[jRand]; - int clsidx = idx4class[wrd]; - - labels->SetValue(wrd, j, 1); - - if (class_size > 0) - labels->SetValue(nwords + clsidx, j, 1); - } - -} - -template -void SequenceReader::GetInputToClass(std::map*>& matrices) -{ - Matrix* idx2cls= matrices[STRIDX2CLS]; - if (idx2cls== nullptr) return; - - if (m_idx2clsRead) return; - - // populate local CPU matrix - m_id2classLocal->SwitchToMatrixType(MatrixType::DENSE); - m_id2classLocal->Resize(nwords , 1, false); - - //move to CPU since element-wise operation is expensive and can go wrong in GPU - int curDevId = m_id2classLocal->GetDeviceId(); - m_id2classLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - for (size_t j = 0; j < nwords ; j++) - { - int clsidx = idx4class[(int)j]; - (*m_id2classLocal)(j,0) = (float)clsidx; - } - m_id2classLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - - int oldDeviceId = idx2cls->GetDeviceId(); - // caution, SetValue changes idx2cls from GPU to CPU, may change this behavior later - idx2cls->SetValue(*m_id2classLocal); - idx2cls->TransferFromDeviceToDevice(idx2cls->GetDeviceId(), oldDeviceId, true); - - m_idx2clsRead = true; -} - -template -void SequenceReader::GetClassInfo(std::map*>& matrices) -{ - Matrix* clsinfo = matrices[CLASSINFO]; - if (clsinfo == nullptr) return; - - if (m_clsinfoRead) return; - - // populate local CPU matrix - m_classInfoLocal->SwitchToMatrixType(MatrixType::DENSE); - m_classInfoLocal->Resize(2, class_size); - - //move to CPU since element-wise operation is expensive and can go wrong in GPU - int curDevId = m_classInfoLocal->GetDeviceId(); - 
m_classInfoLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - - int clsidx; - int prvcls = -1; - for (size_t j = 0; j < nwords; j++) - { - clsidx = idx4class[(int)j]; - if (prvcls != clsidx) - { - if (prvcls >= 0) - (*m_classInfoLocal)(1, prvcls) = (float)j; - prvcls = clsidx; - (*m_classInfoLocal)(0, prvcls) = (float)j; - } - } - (*m_classInfoLocal)(1, prvcls) = (float)nwords; - - m_classInfoLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - - int oldDeviceId = clsinfo->GetDeviceId(); - // caution, SetValue changes m_classInfoLocal from GPU to CPU, may change this behavior later - clsinfo->SetValue(*m_classInfoLocal); - clsinfo->TransferFromDeviceToDevice(clsinfo->GetDeviceId(), oldDeviceId, true); - - m_clsinfoRead = true; -} - -template -bool SequenceReader::GetMinibatch(std::map*>& matrices) -{ - - // get out if they didn't call StartMinibatchLoop() first - if (m_mbSize == 0) - return false; - - // check to see if we have changed epochs, if so we are done with this one. - if (m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]) - return false; - - bool moreData = EnsureDataAvailable(m_mbStartSample); - if (moreData == false) - return false; - - // figure which sweep of the randomization we are on - size_t recordStart = m_totalSamples?m_mbStartSample%m_totalSamples:m_mbStartSample; - - // actual size is the size of the next seqence - size_t actualmbsize = 0; - - // figure out the size of the next sequence - if (m_seqIndex > 0 && m_seqIndex < m_sequence.size() && m_sequence.size() > 1) - { - actualmbsize = m_sequence[m_seqIndex] - m_sequence[m_seqIndex-1]; - } - else - { - actualmbsize = m_sequence[0]; - } - - if (actualmbsize > m_mbSize){ - RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. 
memory can crash!", m_mbSize, actualmbsize); - } - - // hit the end of the dataset, - if (!moreData) - { - // make sure we take into account hitting the end of the dataset (not wrapping around) - actualmbsize = min(m_totalSamples-recordStart,actualmbsize); - } - - // now get the labels - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - if (labelInfo.type == labelCategory) - { - memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*actualmbsize); - memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*actualmbsize); - } - else if (labelInfo.type != labelNone) - { - memset(m_labelsBuffer,0,sizeof(ElemType)*1*actualmbsize); - } - - if (actualmbsize > 0) - { - - memset(m_featuresBuffer, 0, sizeof(ElemType)*actualmbsize*labelInfo.dim); - - //loop through all the samples - int j = 0; - Matrix& features = *matrices[m_featuresName]; - if (matrices.find(m_featuresName) != matrices.end()) - { - if(features.GetMatrixType() == MatrixType::DENSE) - { - features.Resize(labelInfo.dim, actualmbsize, false); - features.SetValue(0); - } - else - { - features.Resize(labelInfo.dim, actualmbsize); - features.Reset(); - } - } - - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - - // vector of feature data goes into matrix column - size_t idx = (size_t)m_featureData[jRand]; - m_featuresBuffer[j*labelInfo.dim + idx] = (ElemType)1; - - if (matrices.find(m_featuresName) != matrices.end()) - features.SetValue(idx, j, (ElemType)1); - } - - GetLabelOutput(matrices, m_mbStartSample, actualmbsize); - GetInputToClass(matrices); - GetClassInfo(matrices); - - // make sure that the sequence index matches our end index - assert(m_sequence[m_seqIndex] == m_mbStartSample+actualmbsize); - // go to the next sequence - m_seqIndex++; - } - - // advance to the next minibatch - m_mbStartSample += actualmbsize; - - // if they don't want partial minibatches, skip data transfer and return - if (actualmbsize == 0) // no records found (end of minibatch) - { - return false; - } - - // now transfer to the GPU as needed - try{ - // get the features array - if (matrices.find(m_featuresName) == matrices.end()) - { - Matrix& nbs = *matrices[L"numberobs"]; - int curDevId = nbs.GetDeviceId(); - nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - nbs(0,0) = (float)actualmbsize; - nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - for (size_t i = 0; i < actualmbsize; i++) - { - std::wstring ws = msra::strfun::wstrprintf (L"feature%d", i); - Matrix& features = *matrices[ws]; - features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); - } - } - }catch(...) - { - RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); - } - - try - { - if (labelInfo.type == labelCategory) - { - if (matrices.find(m_labelsName[labelInfoOut]) == matrices.end()) - { - for (size_t i = 0; i < actualmbsize; i++) - { - std::wstring ws = msra::strfun::wstrprintf (L"label%d", i); - Matrix* labels = matrices[ws]; - labels->SetValue(labelInfo.dim, 1, &m_labelsBuffer[i * labelInfo.dim],matrixFlagNormal); - } - } - } - else if (labelInfo.type != labelNone) - { - Matrix* labels = matrices[m_labelsName[labelInfoOut]]; - labels->SetValue(1, actualmbsize,m_labelsBuffer,matrixFlagNormal); - } - }catch(...) 
- { - RuntimeError("cannot find matrices for %s", m_labelsName[labelInfoOut]); - } - - // we read some records, so process them - return true; -} - -template -void SequenceReader::OrganizeClass() -{ - //allocate auxiliary class variables (for faster search when normalizing probability at output layer) - int cl, i; - for (i=0; i -const std::map::LabelIdType, typename IDataReader::LabelType>& SequenceReader::GetLabelMapping(const std::wstring& sectionName) -{ - if (m_cachingReader) - { - return m_cachingReader->GetLabelMapping(sectionName); - } - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - return labelInfo.mapIdToLabel; -} - -// SetLabelMapping - Sets the label mapping from integer index to label -// labelMapping - mapping table from label values to IDs (must be 0-n) -// note: for tasks with labels, the mapping table must be the same between a training run and a testing run -template -void SequenceReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename LabelType>& labelMapping) -{ - if (m_cachingReader) - { - RuntimeError("Cannot set mapping table when the caching reader is being used"); - } - LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - labelInfo.mapIdToLabel = labelMapping; - labelInfo.mapLabelToId.clear(); - for (std::pair var : labelMapping) - { - labelInfo.mapLabelToId[var.second] = var.first; - } -} - -// GetData - Gets metadata from the specified section (into CPU memory) -// sectionName - section name to retrieve data from -// numRecords - number of records to read -// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request -// dataBufferSize - [in] size of the databuffer in bytes -// [out] size of buffer filled with data -// recordStart - record to start reading from, defaults to zero (start of data) -// returns: true if data remains to be read, false if the end of data was reached -template -bool SequenceReader::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) -{ - if (!m_cachingReader) - RuntimeError("GetData not supported in SequenceReader"); - return m_cachingReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart); -} - -// instantiate all the combinations we expect to be used -template class SequenceReader; -template class SequenceReader; - -template -void BatchSequenceReader::Init(const ConfigParameters& readerConfig) -{ - // See if the user wants caching - m_cachingReader = NULL; - m_cachingWriter = NULL; - - // NOTE: probably want to re-enable at some point - - // initialize the cache - //InitCache(readerConfig); - //m_readerConfig = readerConfig; - - //// if we have a cache, no need to parse the test files... 
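-    // (BatchSequenceReader::Init repeats SequenceReader::Init almost line for
-    // line; the only addition is reading nbruttsineachrecurrentiter into
-    // mBlgSize at the end.)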
- //if (m_cachingReader) - // return; - - std::vector features; - std::vector labels; - GetFileConfigNames(readerConfig, features, labels); - if (features.size() > 0) - { - m_featuresName = features[0]; - } - - if (labels.size() == 2) - { - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelsName[index] = labels[index]; - } - } - else - RuntimeError("two label definitions (in and out) required for Sequence Reader"); - - ConfigParameters featureConfig = readerConfig(m_featuresName,""); - ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; - - class_size = 0; - m_featureDim = featureConfig("dim"); - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelInfo[index].idMax = 0; - m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); - m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); - - // determine label type desired - std::string labelType(labelConfig[index]("labelType","Category")); - if (labelType == "Category") - { - m_labelInfo[index].type = labelCategory; - } - else if (labelType == "NextWord") - { - // in this case, it's all identical to the Input labels, except the data type - m_labelInfo[index].type = labelNextWord; - m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; - } - else if (labelType == "None") - { - m_labelInfo[index].type = labelNone; - m_labelInfo[index].dim = 0; // override for no labels - } - - // if we have labels, we need a label Mapping file, it will be a file with one label per line - if (m_labelInfo[index].type != labelNone) - { - std::wstring wClassFile = readerConfig("wordclass", ""); - nwords = labelConfig[index]("labelDim"); - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - } - - std::vector arrayLabels; - std::wstring labelPath = labelConfig[index]("labelMappingFile"); - if (fexists(labelPath)) - { - LoadLabelFile(labelPath, arrayLabels); - for (int i=0; i < arrayLabels.size(); ++i) - { - LabelType label = arrayLabels[i]; - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); - m_labelInfo[index].mapName = labelPath; - } - else - { - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - int iMax = -1, i; - for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) - { - LabelType label = ptr->first; - i = ptr->second; - iMax = max(i, iMax); - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)(iMax+1); - - OrganizeClass(); - - } - m_labelInfo[index].mapName = labelPath; - - m_labelInfo[index].fileToWrite = labelPath; - } - } - - m_labelInfo[index].dim = labelConfig[index]("labelDim"); - - // update dimension if the file says it's bigger - if (m_labelInfo[index].dim < m_labelInfo[index].idMax) - { - m_labelInfo[index].dim = m_labelInfo[index].idMax; - } - } - - // initialize all the variables - m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; - m_endReached = false; - m_readNextSampleLine = 0; - m_readNextSample = 0; - m_traceLevel = readerConfig("traceLevel","0"); - m_parser.SetTraceLevel(m_traceLevel); - - if (readerConfig.Exists("randomize")) - { - string randomizeString = readerConfig("randomize"); - if (randomizeString == "None") - { - ; - } - else if (randomizeString == "Auto") - { - ; - } - else - { - ;//readerConfig("randomize"); - } - } - else - { - ; 
//randomizeAuto; - } - - // The input data is a combination of the label Data and extra feature dims together -// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; - m_featureCount = 1; - - std::wstring m_file = readerConfig("file"); - if (m_traceLevel > 0) - fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); - - const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; - m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); - - mBlgSize = readerConfig("nbruttsineachrecurrentiter", "1"); -} - -template -void BatchSequenceReader::Reset() -{ - mProcessed.clear(); - mToProcess.clear(); - mLastProcssedSentenceId = 0; - mPosInSentence = 0; - mLastPosInSentence = 0; - mNumRead = 0; - - if (m_labelTemp.size() > 0) - m_labelTemp.clear(); - if (m_featureTemp.size() > 0) - m_featureTemp.clear(); - m_parser.mSentenceIndex2SentenceInfo.clear(); -} - -template -void BatchSequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) -{ - // if we aren't currently caching, see if we can use a cache - if (!m_cachingReader && !m_cachingWriter) - { - InitCache(m_readerConfig); - if (m_cachingReader) - ReleaseMemory(); // free the memory used by the SequenceReader - } - - // if we are reading from the cache, do so now and return - if (m_cachingReader) - { - m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); - return; - } - - if (m_featuresBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; - memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); - } - - if (m_labelsBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - if (labelInfo.type == labelCategory) - { - m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); - m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; - memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); - } - else if (labelInfo.type != labelNone) - { - m_labelsBuffer = new ElemType[mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); - m_labelsIdBuffer = NULL; - } - } - - m_featuresBufferRow = new size_t[mbSize]; - m_featuresBufferRowIdx = new size_t[mbSize]; - - m_labelsIdBufferRow = new CPUSPARSE_INDEX_TYPE[2 * mbSize]; - m_labelsBlock2Id = new size_t[2*mbSize]; - m_labelsBlock2UniqId = new size_t[2*mbSize]; - - m_id2classLocal = new Matrix(CPUDEVICE); - m_classInfoLocal = new Matrix(CPUDEVICE); - - m_mbSize = mbSize; - if (requestedEpochSamples == requestDataSize) - { - if (!m_endReached) - { - m_epochSize = requestDataSize; - } - } - else - { - m_epochSize = requestedEpochSamples; - } - - // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set - size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; - m_epoch = epoch; - m_mbStartSample = epoch*m_epochSize; - - // allocate room for the data - m_featureData.reserve(m_featureCount*epochSize); - if (m_labelInfo[labelInfoOut].type == labelCategory) - m_labelIdData.reserve(epochSize); - else if (m_labelInfo[labelInfoOut].type != labelNone) - m_labelData.reserve(epochSize); - m_sequence.reserve(m_seqIndex); // clear 
out the sequence array - /// this is too complicated for LM - // SetupEpoch(); - /// use the LMSetupEpoch() instead - LMSetupEpoch(); - - m_clsinfoRead = false; - m_idx2clsRead = false; - - m_parser.ParseReset(); - - Reset(); -} - -template -size_t BatchSequenceReader::FindNextSentences(size_t numRead) -{ - size_t sln = 0; - - if (numRead == 0) return 0; - - if (mProcessed.size() == 0) - { - mProcessed.resize(numRead, false); - } - - if (mToProcess.size() > 0) - { - bool allDone = false; - for (int s = 0; s < mToProcess.size(); s++) - { - int mp = (int)mToProcess[s]; - if (mProcessed[mp]) - { - mLastProcssedSentenceId = mp; - mLastPosInSentence = 0; - allDone = true; - break; - } - } - if (allDone) - { - mToProcess.clear(); - } - } - - if (mToProcess.size() > 0) - { - sln = m_parser.mSentenceIndex2SentenceInfo[mToProcess[0]].sLen; - return sln; - } - - for (size_t seq = mLastProcssedSentenceId ; seq < numRead; seq++) - { - if (mProcessed[seq]) continue; - - if (sln == 0) - { - sln = m_parser.mSentenceIndex2SentenceInfo[seq].sLen; - } - if (sln == m_parser.mSentenceIndex2SentenceInfo[seq].sLen && - mProcessed[seq] == false && mToProcess.size() < mBlgSize) - mToProcess.push_back(seq); - - if (mToProcess.size() == mBlgSize) break; - } - - return sln; -} - -template -bool BatchSequenceReader::EnsureDataAvailable(size_t /*mbStartSample*/) -{ - bool bDataIsThere = true; - - m_featureData.clear(); - m_labelIdData.clear(); - - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - - bool nextWord = false; - if (m_labelInfo[labelInfoOut].type == labelNextWord) - { - nextWord = true; - } - LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; - - // see how many we already read - std::vector seqPos; - - size_t sLn = FindNextSentences(mNumRead); - if (sLn == 0) - { - Reset(); - - mNumRead = m_parser.Parse(CACHE_BLOG_SIZE, &m_labelTemp, &m_featureTemp, &seqPos); - if (mNumRead == 0) return false; - - std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end()); - - m_readNextSampleLine += mNumRead; - sLn = FindNextSentences(mNumRead); - } - - /// add one minibatch - size_t i = mLastPosInSentence; - size_t j = 0; - // exclude the last token since it is the last label to be predicted - for (i = mLastPosInSentence; j < m_mbSize && i < sLn-1; i++ , j++) - { - for (int k = 0; k < mToProcess.size(); k++) - { - size_t seq = mToProcess[k]; - size_t label = m_parser.mSentenceIndex2SentenceInfo[seq].sBegin + i; - - // labelIn should be a category label - LabelType labelValue = m_labelTemp[label++]; - - // to-do, should ignore , check the sentence ending is - // need to remove from the training set - // allocate and initialize the next chunck of featureData - if (labelIn.type == labelCategory) - { - LabelIdType index = GetIdFromLabel(labelValue, labelIn); - - // use the found value, and set the appropriate location to a 1.0 - assert(labelIn.dim > index); // if this goes off labelOut dimension is too small - m_featureData.push_back((float)index); - } - else - { - RuntimeError("Input label expected to be a category label"); - } - - // now get the output label - if (m_labelInfo[labelInfoOut].type == labelCategory) - { - labelValue = m_labelTemp[label++]; - } - else if (nextWord) - { - // this is the next word (label was incremented above) - labelValue = m_labelTemp[label]; - if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) - { - labelValue = labelInfo.endSequence; - } - } - else - { - 
RuntimeError("Invalid output label type, expected Category, or Next Word"); - } - - // get the ID from the label - LabelIdType id = GetIdFromLabel(labelValue, labelInfo); - m_labelIdData.push_back(id); - - m_totalSamples ++; - } - } - - mLastPosInSentence = i; - - return bDataIsThere; -} - -template -size_t BatchSequenceReader::NumberSlicesInEachRecurrentIter() -{ - size_t sz = mToProcess.size(); - return sz; -} - -template -void BatchSequenceReader::SetNbrSlicesEachRecurrentIter(const size_t mz) -{ - mBlgSize = mz; -} - -template -bool BatchSequenceReader::GetMinibatch(std::map*>& matrices) -{ - - // get out if they didn't call StartMinibatchLoop() first - if (m_mbSize == 0) - return false; - - bool moreData = EnsureDataAvailable(m_mbStartSample); - if (moreData == false) - return false; - - // actual size is the size of the next seqence - size_t actualmbsize = 0; - - // figure out the size of the next sequence - actualmbsize = m_labelIdData.size() ; - if (actualmbsize > m_mbSize * mToProcess.size()){ - RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. memory can crash!", m_mbSize, actualmbsize); - } - - // now get the labels - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - if (actualmbsize > 0) - { - - //loop through all the samples - Matrix& features = *matrices[m_featuresName]; - - // copy m_featureData to matrix - // we always copy it to cpu first and then convert to gpu if gpu is desired. - DEVICEID_TYPE featureDeviceId = features.GetDeviceId(); - features.TransferFromDeviceToDevice(featureDeviceId, CPUDEVICE, false, true, false); - - if (features.GetMatrixType() == MatrixType::DENSE) - { - features.Resize(labelInfo.dim, actualmbsize); - features.SetValue(0); - } - else - { - features.Resize(labelInfo.dim, actualmbsize, actualmbsize); - features.Reset(); - } - - for (size_t j = 0; j < actualmbsize; ++j) - { - // vector of feature data goes into matrix column - size_t idx = (size_t)m_featureData[j]; - - //if (matrices.find(m_featuresName) != matrices.end()) - features.SetValue(idx, j, (ElemType)1); - } - - features.TransferFromDeviceToDevice(CPUDEVICE, featureDeviceId, false,false, false); - - //else // for GPU - //{ - // if (matrices.find(m_featuresName) != matrices.end()) - // { - // m_indexer.clear(); - // size_t size = m_featureData.size(); - - // for(int i = 0; i < size; i++) - // { - // m_featuresBufferRow[i] = (size_t)m_featureData[i]; - // if(m_indexer.find(m_featuresBufferRow[i]) == m_indexer.end()) - // { - // m_indexer[m_featuresBufferRow[i]] = m_indexer.size(); - // } - // m_featuresBufferRowIdx[i] = m_indexer[m_featuresBufferRow[i]]; - // } - // features.SetMatrixFromCSCFormat(m_featuresBufferRow, m_featuresBufferRowIdx, size, m_indexer.size()); - // } - //} - - // TODO: move these two methods to startMiniBatchLoop() - GetInputToClass(matrices); - GetClassInfo(matrices); - GetLabelOutput(matrices, 0, actualmbsize); - - // go to the next sequence - m_seqIndex++; - } - else - return false; - - // now transfer to the GPU as needed - try{ - // get the features array - if (matrices.find(m_featuresName) == matrices.end()) - { - Matrix& nbs = *matrices[L"numberobs"]; - int curDevId = nbs.GetDeviceId(); - nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - nbs(0,0) = (float)actualmbsize; - nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - for (size_t i = 0; i < actualmbsize; i++) - { - std::wstring ws = 
msra::strfun::wstrprintf (L"feature%d", i); - Matrix& features = *matrices[ws]; - features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); - } - } - }catch(...) - { - RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); - } - - // we read some records, so process them - return true; -} - -template -void BatchSequenceReader::SetSentenceEnd(int wrd, int pos, int actualMbSize) -{ - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - LabelIdType index = GetIdFromLabel(labelIn.endSequence.c_str(), labelIn); - - if (pos == actualMbSize - 1) - { - if (wrd == (int)index) - mSentenceEnd = true; - else - mSentenceEnd = false; - } -} - -template -void BatchSequenceReader::SetSentenceBegin(int wrd, int pos, int /*actualMbSize*/) -{ - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - LabelIdType index = GetIdFromLabel(labelIn.beginSequence.c_str(), labelIn); - - if (pos == 0) - { - if (wrd == (int)index) - mSentenceBegin = true; - else - mSentenceBegin = false; - } -} - -template -void BatchSequenceReader::SetSentenceEndInBatch(vector &sentenceEnd) -{ - sentenceEnd.resize(mToProcess.size()); - if (mSentenceBegin) - { - sentenceEnd.assign(mToProcess.size(), 0); - } - else - { - sentenceEnd.assign(mToProcess.size(), m_mbSize+2); - } -} - -template -bool BatchSequenceReader::DataEnd(EndDataType endDataType) -{ - bool ret = false; - switch (endDataType) - { - case endDataNull: - assert(false); - break; - case endDataEpoch: - case endDataSet: - ret = !EnsureDataAvailable(m_mbStartSample); - break; - case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true - if (mSentenceEnd) - { - for (auto ptr = mToProcess.begin(); ptr != mToProcess.end(); ptr++) - mProcessed[*ptr] = true; - } - ret = mSentenceEnd; - break; - } - return ret; - -} - -template -void BatchSequenceReader::GetLabelOutput(std::map*>& matrices, - size_t m_mbStartSample, size_t actualmbsize) -{ - size_t j = 0; - Matrix* labels = matrices[m_labelsName[labelInfoOut]]; - if (labels == nullptr) return; - - if(labels->GetMatrixType() == MatrixType::DENSE) - { - labels->Resize(nwords + class_size, actualmbsize, false); - labels->SetValue(0); - } - else - { - labels->Resize(nwords + class_size, actualmbsize, 2*actualmbsize); - labels->Reset(); - } - - if(labels->GetCurrentMatrixLocation() == CPU) { - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - - int wrd = m_labelIdData[jRand]; - int clsidx = idx4class[wrd]; - - labels->SetValue(wrd, j, 1); - - SetSentenceEnd(wrd, j, actualmbsize); - SetSentenceBegin(wrd, j, actualmbsize); - - if (class_size > 0) - labels->SetValue(nwords + clsidx, j, 1); - } - } - else // GPU - { - m_indexer.clear(); - int p = 0; - int b = 0; - int nz = 0; - - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - int wrd = m_labelIdData[jRand]; - int clsidx = idx4class[wrd]; - SetSentenceEnd(wrd, j, actualmbsize); - SetSentenceBegin(wrd, j, actualmbsize); - - int start[2]; - int end[2]; - int target[2]; - int blockId[2]; - - start[0] = (int)(*m_classInfoLocal)(0, clsidx); - end[0] = (int)(*m_classInfoLocal)(1, clsidx); - target[0] = wrd; - blockId[0] = clsidx; - start[1] = nwords; - end[1] = nwords 
+ (int)(*m_classInfoLocal).GetNumCols(); - target[1] = nwords + clsidx; - blockId[1] = -1; - - for(int i = 0; i < 2; i++) - { - m_labelsIdBufferRow[p] = target[i]; - int len = end[i] - start[i]; - - if(m_indexer.find(blockId[i]) == m_indexer.end()) - { - m_indexer[blockId[i]] = b; - b += len; - } - m_labelsBlock2Id[p] = nz; - m_labelsBlock2UniqId[p] = m_indexer[blockId[i]]; - nz += len; - p++; - } - } - - labels->SetMatrixFromLabelAndClass(m_labelsIdBufferRow, m_labelsBlock2Id, m_labelsBlock2UniqId, 2*actualmbsize, nz, b); - } -} - -template class BatchSequenceReader; -template class BatchSequenceReader; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// SequenceReader.cpp : Defines the exported functions for the DLL application. +// + + +#include "stdafx.h" +#define DATAREADER_EXPORTS // creating the exports here +#include "DataReader.h" +#include "SequenceReader.h" +#ifdef LEAKDETECT +#include // leak detection +#endif +#include "fileutil.h" // for fexists() + +namespace Microsoft { namespace MSR { namespace CNTK { + +// ReadLine - Read a line +// readSample - sample to read in global sample space +// returns - true if we successfully read a record, otherwise false +template +bool SequenceReader::ReadRecord(size_t /*readSample*/) +{ + return false; // not used +} + +// RecordsToRead - Determine number of records to read to populate record buffers +// mbStartSample - the starting sample from which to read +// tail - we are checking for possible remainer records to read (default false) +// returns - true if we have more to read, false if we hit the end of the dataset +template +size_t SequenceReader::RecordsToRead(size_t mbStartSample, bool tail) +{ + assert(mbStartSample >= m_epochStartSample); + // determine how far ahead we need to read + // need to read to the end of the next minibatch + size_t epochSample = mbStartSample; + epochSample %= m_epochSize; + + // determine number left to read for this epoch + size_t numberToEpoch = m_epochSize - epochSample; + // we will take either a minibatch or the number left in the epoch + size_t numberToRead = min(numberToEpoch, m_mbSize); + if (numberToRead == 0 && !tail) + numberToRead = m_mbSize; + + return numberToRead; +} + +// GetIdFromLabel - get an Id from a Label +// mbStartSample - the starting sample we are ensureing are good +// endOfDataCheck - check if we are at the end of the dataset (no wraparound) +// returns - true if we have more to read, false if we hit the end of the dataset +template +/*IDataReader::LabelIdType*/ unsigned SequenceReader::GetIdFromLabel(const std::string& labelValue, LabelInfo& labelInfo) +{ + auto found = labelInfo.mapLabelToId.find(labelValue); + + // not yet found, add to the map + if (found == labelInfo.mapLabelToId.end()) + { + RuntimeError("%s not in vocabulary", labelValue.c_str()); + } + return found->second; +} + +template +/*IDataReader::LabelIdType*/ bool SequenceReader::CheckIdFromLabel(const std::string& labelValue, const LabelInfo& labelInfo, unsigned & labelId) +{ + auto found = labelInfo.mapLabelToId.find(labelValue); + + // not yet found, add to the map + if (found == labelInfo.mapLabelToId.end()) + { + return false; + } + labelId = found->second; + return true; +} + +// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested +// mbStartSample - the starting sample we are starting with +// endOfDataCheck - check if we are at the end of the dataset (no wraparound) +// returns - true if we have more to read, false if we hit the 
end of the dataset +template +bool SequenceReader::EnsureDataAvailable(size_t mbStartSample, bool /*endOfDataCheck*/) +{ + assert(mbStartSample >= m_epochStartSample); + // determine how far ahead we need to read + // need to read to the end of the next minibatch + size_t epochSample = mbStartSample; + bool moreToRead = true; + + size_t numberToRead = RecordsToRead(mbStartSample); + + // check to see if we have the proper records read already + if (m_readNextSample >= mbStartSample+numberToRead && mbStartSample >= m_epochStartSample) + return true; + + // if we have another sequence already read and waiting, just return now + if (m_seqIndex < m_sequence.size()) + return true; + + m_seqIndex = 0; + m_mbStartSample = 0; + m_sequence.clear(); + m_featureData.clear(); + m_labelIdData.clear(); + + m_readNextSample = 0; + epochSample = 0; + + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + + bool nextWord = false; + if (m_labelInfo[labelInfoOut].type == labelNextWord) + { + nextWord = true; + } + LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; + + //if (m_labelIdData.size() > epochSample) + //{ + // m_labelIdData.resize(epochSample); + // m_labelData.resize(epochSample*labelInfo.dim); + //} + + // see how many we already read + int sequencesRead = 0; + std::vector featureTemp; + std::vector labelTemp; + std::vector seqPos; + do + { + int numRead = m_parser.Parse(CACHE_BLOG_SIZE, &labelTemp, &featureTemp, &seqPos); + moreToRead = (numRead != 0); + + // translate from the sparse parsed data format to the to the training format data + int label = 0; + bool bSentenceStart = false; + SequencePosition sposLast = SequencePosition(0,0,seqFlagNull); + for (int seq = 0; seq < numRead; seq++) + { + // check + SequencePosition spos = seqPos[seq]; + if (spos.labelPos == sposLast.labelPos && spos.numberPos == sposLast.numberPos) + continue; + sposLast = spos; + + bSentenceStart = true; + + // loop through the labels for this entry + while (label < spos.labelPos) /// need to minus one since + { + + // labelIn should be a category label + LabelType labelValue = labelTemp[label++]; + + if (trim(labelValue).size() == 0) + continue; // empty input + + // check for end of sequence marker + if (!bSentenceStart && (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str()) || ((label - 1 )% m_mbSize == 0) )) + { + // ignore those cases where $ is put in the begining, because those are used for initialization purpose + spos.flags |= seqFlagStopLabel; + sequencesRead++; + + // create the seqence table + m_sequence.push_back(epochSample); + if ((m_sequence.size() == 1 ? epochSample : epochSample - m_sequence[m_sequence.size()-2]) > m_mbSize) + { + fprintf(stderr, "read sentence length is longer than the minibatch size. should be smaller. increase the minibatch size to at least %d", epochSample); + RuntimeError("read sentence length is longer than the minibatch size. should be smaller. 
increase the minibatch size to at least %d", epochSample); + } + + if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) + continue; /// ignore sentence ending + } + + // to-do, should ignore , check the sentence ending is + // need to remove from the training set + // allocate and initialize the next chunck of featureData + if (labelIn.type == labelCategory) + { + LabelIdType index = GetIdFromLabel(labelValue, labelIn); + + // use the found value, and set the appropriate location to a 1.0 + assert(labelIn.dim > index); // if this goes off labelOut dimension is too small + m_featureData.push_back((float)index); + } + else + { + RuntimeError("Input label expected to be a category label"); + } + + // if we have potential features + if (m_featureDim > 0) + { + RuntimeError("to-do. Assume sparse input feature. need to change the code from dense matrix"); + // move the position up to the start of the additional features section +/* pos += labelIn.dim; + assert(pos + m_featureDim == m_featureData.size()); + // this has to be an even number, a pair of index and value + if ((spos.numberPos&1) != 0) + RuntimeError("Features must be specified in pairs (index:value). Invalid features for label '%s'\n", labelValue); + + while (feature < spos.numberPos) + { + int index = (int)featureTemp[feature++]; + if (index < 0 || index >= m_featureDim) + RuntimeError("Invalid feature index: %d for label '%s', feature max dimension = %lld\n", index, labelValue, m_featureDim); + + ElemType value = featureTemp[feature++]; + m_featureData[pos+index] = value; + } + */ + } + + // now get the output label + if (m_labelInfo[labelInfoOut].type == labelCategory) + { + labelValue = labelTemp[label++]; + } + else if (nextWord) + { + // this is the next word (label was incremented above) + labelValue = labelTemp[label]; + if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) + { + labelValue = labelInfo.endSequence; + } + } + else + { + RuntimeError("Invalid output label type, expected Category, or Next Word"); + } + + // get the ID from the label + LabelIdType id = GetIdFromLabel(labelValue, labelInfo); + m_labelIdData.push_back(id); + + m_readNextSample++; + epochSample++; + if (!m_endReached) + m_totalSamples++; // add to the total number of records in the dataset + + bSentenceStart = false; + } + + { + // check if the reading is right + int jEnd = (int) m_labelIdData.size() - 1; + LabelIdType index ; + if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) + RuntimeError("cannot find sentence begining label"); + + if (m_labelIdData[jEnd] != index ) + /// for language model, the first word/letter has to be + RuntimeError("SequenceReader: the last letter/word of a batch has to be the sentence ending symbol"); + } + + } + + m_readNextSampleLine += numRead; + } + while (sequencesRead < 1 && moreToRead); // we need to read at least one sequence or have no more data + + // if we read to the end, update appropriate variables + if (!moreToRead) + { + UpdateDataVariables(); + } + + // if there more to read + return moreToRead; +} + +// UpdateDataVariables - Update variables that depend on the dataset being completely read +template +void SequenceReader::UpdateDataVariables() +{ + // if we haven't been all the way through the file yet + if (!m_endReached) + { + // get the size of the dataset + assert(m_totalSamples*m_featureCount >= m_featureData.size()); + + // if they want us to determine epoch size based on dataset size, do that + if (m_epochSize == 
requestDataSize)
+    {
+        m_epochSize = m_totalSamples;
+    }
+
+    WriteLabelFile();
+
+    // we got to the end of the dataset
+    m_endReached = true;
+    }
+
+    // update the label dimension if it is not big enough; needed here because m_labelIdMax gets updated in the processing loop (after a read)
+    for (int index = labelInfoMin; index < labelInfoMax; ++index)
+    {
+        if (m_labelInfo[index].type == labelCategory && m_labelInfo[index].idMax > m_labelInfo[index].dim)
+            m_labelInfo[index].dim = m_labelInfo[index].idMax;   // update the label dimensions if different
+    }
+}
+
+template<class ElemType>
+void SequenceReader<ElemType>::WriteLabelFile()
+{
+    // write out the label files, if requested; this can only be done once the full label mapping is known
+    for (int index = labelInfoMin; index < labelInfoMax; ++index)
+    {
+        LabelInfo& labelInfo = m_labelInfo[index];
+
+        // write out the label file if they don't have one
+        if (!labelInfo.fileToWrite.empty())
+        {
+            if (labelInfo.mapIdToLabel.size() > 0)
+            {
+                File labelFile(labelInfo.fileToWrite, fileOptionsWrite | fileOptionsText);
+                for (int i=0; i < labelInfo.mapIdToLabel.size(); ++i)
+                {
+                    labelFile << labelInfo.mapIdToLabel[i] << '\n';
+                }
+                labelInfo.fileToWrite.clear();
+            }
+            else if (!m_cachingWriter)
+            {
+                fprintf(stderr, "WARNING: file %ws NOT written to disk, label files are only written when starting at epoch zero!", labelInfo.fileToWrite.c_str());
+            }
+        }
+    }
+}
+
+template<class ElemType>
+void SequenceReader<ElemType>::LoadLabelFile(const std::wstring& filePath, std::vector<LabelType>& retLabels)
+{
+    File file(filePath, fileOptionsRead);
+
+    // initialize with file name
+    std::string path = msra::strfun::utf8(filePath);
+    auto location = path.find_last_of("/\\");
+    if (location != std::string::npos)
+        path = path.substr(location+1);
+
+    // read the file line by line; each non-blank line is one label
+    string str;
+    retLabels.resize(0);
+    while (!file.IsEOF())
+    {
+        file.GetLine(str);
+
+        // skip blank lines
+        string::size_type pos = str.find_first_not_of(" \t");
+        if (pos != string::npos)
+        {
+            retLabels.push_back((LabelType)trim(str));
+        }
+    }
+}
+
+
+// Destroy - cleanup and remove this class
+// NOTE: this destroys the object, and it can't be used past this point
+template<class ElemType>
+void SequenceReader<ElemType>::Destroy()
+{
+    delete this;
+}
+
+// Init - Reader Initialize for multiple data sets
+// config - [in] configuration parameters for the datareader
+// Sample format below:
+//# Parameter values for the reader
+//reader=[
+//  # reader to use
+//  readerType=SequenceReader
+//  randomize=None
+//  # additional features dimension
+//  featureDim=784
+//  file=c:\data\sequence\sequence.txt
+//  labelIn=[
+//    dim=26
+//    labelMappingFile=c:\data\sequence\alphabet.txt
+//    labelType=Category
+//    beginSequence=""
+//    endSequence=""
+//  ]
+//  labelOut=[
+//    dim=129
+//    labelMappingFile=c:\data\sequence\phonemes.txt
+//    labelType=Category
+//    beginSequence="O"
+//    endSequence="O"
+//  ]
+//]
+template<class ElemType>
+void SequenceReader<ElemType>::Init(const ConfigParameters& readerConfig)
+{
+    // See if the user wants caching
+    m_cachingReader = NULL;
+    m_cachingWriter = NULL;
+
+    // NOTE: probably want to re-enable at some point
+
+    // initialize the cache
+    //InitCache(readerConfig);
+    //m_readerConfig = readerConfig;
+
+    //// if we have a cache, no need to parse the test files...
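
LoadLabelFile above reads one label per line, and Init below turns that array into the
two lookup tables (mapIdToLabel and mapLabelToId) so ids and label strings can be
translated in both directions. A self-contained sketch of that round trip, assuming a
plain text mapping file; loadLabelMap and the file name are illustrative stand-ins, not
CNTK APIs:

#include <fstream>
#include <iostream>
#include <map>
#include <string>

// Load a one-label-per-line mapping file; a label's id is its line number,
// mirroring LoadLabelFile plus the map-building loop in Init.
static void loadLabelMap(const std::string& path,
                         std::map<unsigned, std::string>& idToLabel,
                         std::map<std::string, unsigned>& labelToId)
{
    std::ifstream file(path);
    std::string line;
    unsigned id = 0;
    while (std::getline(file, line))
    {
        if (line.find_first_not_of(" \t") == std::string::npos)
            continue;  // skip blank lines, as LoadLabelFile does
        idToLabel[id] = line;
        labelToId[line] = id;
        ++id;
    }
}

int main()
{
    std::map<unsigned, std::string> idToLabel;
    std::map<std::string, unsigned> labelToId;
    loadLabelMap("alphabet.txt", idToLabel, labelToId);  // hypothetical mapping file
    std::cout << idToLabel.size() << " labels loaded\n";
    return 0;
}
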
+ //if (m_cachingReader) + // return; + + std::vector features; + std::vector labels; + GetFileConfigNames(readerConfig, features, labels); + if (features.size() > 0) + { + m_featuresName = features[0]; + } + + if (labels.size() == 2) + { + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelsName[index] = labels[index]; + } + } + else + RuntimeError("two label definitions (in and out) required for Sequence Reader"); + + ConfigParameters featureConfig = readerConfig(m_featuresName,""); + ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; + + class_size = 0; + m_featureDim = featureConfig("dim"); + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelInfo[index].idMax = 0; + m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); + m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); + + // determine label type desired + std::string labelType(labelConfig[index]("labelType","Category")); + if (labelType == "Category") + { + m_labelInfo[index].type = labelCategory; + } + else if (labelType == "NextWord") + { + // in this case, it's all identical to the Input labels, except the data type + m_labelInfo[index].type = labelNextWord; + m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; + } + else if (labelType == "None") + { + m_labelInfo[index].type = labelNone; + m_labelInfo[index].dim = 0; // override for no labels + } + + // if we have labels, we need a label Mapping file, it will be a file with one label per line + if (m_labelInfo[index].type != labelNone) + { + std::wstring wClassFile = readerConfig("wordclass", ""); + nwords = labelConfig[index]("labelDim"); + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + } + + std::vector arrayLabels; + std::wstring labelPath = labelConfig[index]("labelMappingFile"); + if (fexists(labelPath)) + { + LoadLabelFile(labelPath, arrayLabels); + for (int i=0; i < arrayLabels.size(); ++i) + { + LabelType label = arrayLabels[i]; + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); + m_labelInfo[index].mapName = labelPath; + } + else + { + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + int iMax = -1, i; + for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) + { + LabelType label = ptr->first; + i = ptr->second; + iMax = max(i, iMax); + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)(iMax+1); + + OrganizeClass(); + + } + m_labelInfo[index].mapName = labelPath; + + m_labelInfo[index].fileToWrite = labelPath; + } + } + + m_labelInfo[index].dim = labelConfig[index]("labelDim"); + + // update dimension if the file says it's bigger + if (m_labelInfo[index].dim < m_labelInfo[index].idMax) + { + m_labelInfo[index].dim = m_labelInfo[index].idMax; + } + } + + // initialize all the variables + m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; + m_endReached = false; + m_readNextSampleLine = 0; + m_readNextSample = 0; + m_traceLevel = readerConfig("traceLevel","0"); + m_parser.SetTraceLevel(m_traceLevel); + + if (readerConfig.Exists("randomize")) + { + string randomizeString = readerConfig("randomize"); + if (randomizeString == "None") + { + ; + } + else if (randomizeString == "Auto") + { + ; + } + else + { + ;//readerConfig("randomize"); + } + } + else + { + ; 
//randomizeAuto; + } + + // The input data is a combination of the label Data and extra feature dims together +// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; + m_featureCount = 1; + + std::wstring m_file = readerConfig("file"); + if (m_traceLevel > 0) + fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); + + const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; + m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); +} + +template +void SequenceReader::ReadWord(char *word, FILE *fin) +{ + int a=0, ch; + + while (!feof(fin)) { + ch=fgetc(fin); + + if (ch==13) continue; + + if ((ch==' ') || (ch=='\t') || (ch=='\n')) { + if (a>0) { + if (ch=='\n') ungetc(ch, fin); + break; + } + + if (ch=='\n') { + strcpy_s(word, strlen(""), (char *)""); + return; + } + else continue; + } + + word[a]=(char)ch; + a++; + + if (a>=MAX_STRING) { + //printf("Too long word found!\n"); //truncate too long words + a--; + } + } + word[a]=0; +} + +template +void SequenceReader::ReadClassInfo(const wstring & vocfile, bool /*flatten*/) +{ + char strFileName[MAX_STRING]; + char stmp[MAX_STRING]; + string strtmp; + size_t sz; + int cnt, clsidx, b; + class_size = 0; + + wcstombs_s(&sz, strFileName, 2048, vocfile.c_str(), vocfile.length()); + + FILE * vin; + vin = fopen(strFileName, "rt") ; + + if (vin == nullptr) + { + RuntimeError("cannot open word class file"); + } + for (int a = 0; a < nwords; a++) + { + fscanf_s(vin, "%6d\t%10d\t", &b, &cnt); + ReadWord(stmp, vin); + fscanf_s(vin, "%d\t\n", &clsidx); + strtmp = stmp; + idx4cnt[b] = cnt; + word4idx[strtmp] = b; + idx4word[b]= strtmp; + + idx4class[b] = clsidx; + class_size = max(class_size, clsidx); + } + fclose(vin); + + class_size ++; +} + +// InitCache - Initialize the caching reader if cache files exist, otherwise the writer +// readerConfig - reader configuration +template +void SequenceReader::InitCache(const ConfigParameters& readerConfig) +{ + // check for a writer tag first (lets us know we are caching) + if (!readerConfig.Exists("writerType")) + return; + + // first try to open the binary cache + bool found = false; + try + { + // TODO: need to go down to all levels, maybe search for sectionType + ConfigArray filesList(','); + vector names; + if (readerConfig.Exists("wfile")) + { + filesList.push_back(readerConfig("wfile")); + if (fexists(readerConfig("wfile"))) + found = true; + } + FindConfigNames(readerConfig, "wfile", names); + for (auto name : names) + { + ConfigParameters config = readerConfig(name); + filesList.push_back(config("wfile")); + if (fexists(config("wfile"))) + found = true; + } + + // if we have a file already, we are going to read the cached files + if (found) + { + ConfigParameters config; + readerConfig.CopyTo(config); + // mmodify the config so the reader types look correct + config["readerType"] = config("writerType"); + config["file"] = filesList; + m_cachingReader = new DataReader(config); + } + else + { + m_cachingWriter = new DataWriter(readerConfig); + + // now get the section names for map and category types + std::map sections; + m_cachingWriter->GetSections(sections); + for (auto pair : sections) + { + // TODO: we would need to add a sequenceMap type here as well + // or maybe change to heirarchal name (i.e. 
root.labelIn.map)
+                if (pair.second == sectionTypeCategoryLabel)
+                {
+                    m_labelsCategoryName[labelInfoOut] = pair.first;
+                }
+                else if (pair.second == sectionTypeLabelMapping)
+                {
+                    m_labelsMapName[labelInfoOut] = pair.first;
+                }
+            }
+        }
+    }
+    catch (runtime_error& err)
+    {
+        fprintf(stderr, "Error attempting to create Binary%s\n%s\n", found?"Reader":"Writer", err.what());
+        delete m_cachingReader;
+        m_cachingReader = NULL;
+        delete m_cachingWriter;
+        m_cachingWriter = NULL;
+    }
+    catch (...)
+    {
+        // if there is any error, just get rid of the object
+        fprintf(stderr, "Error attempting to create Binary%s\n", found?"Reader":"Writer");
+        delete m_cachingReader;
+        m_cachingReader = NULL;
+        delete m_cachingWriter;
+        m_cachingWriter = NULL;
+    }
+}
+
+// destructor - virtual so it gets called properly
+template<class ElemType>
+SequenceReader<ElemType>::~SequenceReader()
+{
+    ReleaseMemory();
+    delete m_cachingReader;
+    delete m_cachingWriter;
+}
+
+// ReleaseMemory - release the memory footprint of SequenceReader
+// used when the caching reader is taking over
+template<class ElemType>
+void SequenceReader<ElemType>::ReleaseMemory()
+{
+    if (m_featuresBuffer!=NULL)
+        delete[] m_featuresBuffer;
+    m_featuresBuffer=NULL;
+    if (m_labelsBuffer!=NULL)
+        delete[] m_labelsBuffer;
+    m_labelsBuffer=NULL;
+    if (m_labelsIdBuffer!=NULL)
+        delete[] m_labelsIdBuffer;
+    m_labelsIdBuffer=NULL;
+    m_featureData.clear();
+    m_labelIdData.clear();
+    m_labelData.clear();
+    m_sequence.clear();
+}
+
+//SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch
+template<class ElemType>
+void SequenceReader<ElemType>::SetupEpoch()
+{
+    // if we are starting fresh (epoch zero and no data read), init everything
+    // however if we are using cachingWriter, we need to know the record count, so do that first
+    if (m_epoch == 0 && m_totalSamples == 0 && m_cachingWriter == NULL)
+    {
+        m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0;
+        m_parser.SetFilePosition(0);
+    }
+    else  // otherwise, position the read to start at the right location
+    {
+        m_seqIndex = 0;
+        // don't know the total number of samples yet, so count them
+        if (m_totalSamples == 0)
+        {
+            if (m_traceLevel > 0)
+                fprintf(stderr, "starting at epoch %d, parsing all data to determine record count\n", (int)m_epoch);
+            // choose a large number to read
+            m_parser.SetFilePosition(0);
+            m_mbStartSample = 0;
+            while (EnsureDataAvailable(m_mbStartSample))
+            {
+                m_mbStartSample = m_totalSamples;
+                m_seqIndex = m_sequence.size();
+            }
+            if (m_traceLevel > 0)
+                fprintf(stderr, "\n %lld records found\n", (long long)m_totalSamples);
+        }
+        m_seqIndex = 0;
+
+        // we have a slight dilemma here: if we haven't determined the end of the file yet
+        // and the user told us to find how many records are in the file, we can't distinguish "almost done"
+        // with a file (a character away) from the middle of the file. So read ahead a record to see if it's there.
+ bool endReached = m_endReached; + if (!endReached) + { + if (!m_parser.HasMoreData()) + { + endReached = true; + UpdateDataVariables(); + assert(m_endReached); + } + } + + // always start from the first sample + m_epochStartSample = m_mbStartSample = 0; + } +} + +template +void SequenceReader::LMSetupEpoch() +{ + m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0; +} + +// utility function to round an integer up to a multiple of size +size_t RoundUp(size_t value, size_t size) +{ + return ((value + size -1)/size)*size; +} + +//StartMinibatchLoop - Startup a minibatch loop +// mbSize - [in] size of the minibatch (number of Samples, etc.) +// NOTE: for sequence data, this will be the MAX size of a sequence, as every sequence could be a different length +// epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run) +// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset +template +void SequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) +{ + // if we aren't currently caching, see if we can use a cache + if (!m_cachingReader && !m_cachingWriter) + { + InitCache(m_readerConfig); + if (m_cachingReader) + ReleaseMemory(); // free the memory used by the SequenceReader + } + + // if we are reading from the cache, do so now and return + if (m_cachingReader) + { + m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); + return; + } + + if (m_featuresBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; + memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); + } + + if (m_labelsBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + if (labelInfo.type == labelCategory) + { + m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); + m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; + memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); + } + else if (labelInfo.type != labelNone) + { + m_labelsBuffer = new ElemType[mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); + m_labelsIdBuffer = NULL; + } + } + + m_mbSize = mbSize; + if (requestedEpochSamples == requestDataSize) + { + if (!m_endReached) + { + m_epochSize = requestDataSize; + } + } + else + { + m_epochSize = requestedEpochSamples; + } + + // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set + size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; + m_epoch = epoch; + m_mbStartSample = epoch*m_epochSize; + + // allocate room for the data + m_featureData.reserve(m_featureCount*epochSize); + if (m_labelInfo[labelInfoOut].type == labelCategory) + m_labelIdData.reserve(epochSize); + else if (m_labelInfo[labelInfoOut].type != labelNone) + m_labelData.reserve(epochSize); + m_sequence.reserve(m_seqIndex); // clear out the sequence array + /// this is too complicated for LM + // SetupEpoch(); + /// use the LMSetupEpoch() instead + LMSetupEpoch(); + + m_clsinfoRead = false; + m_idx2clsRead = false; + + m_parser.ParseReset(); +} + +template +bool SequenceReader::DataEnd(EndDataType endDataType) 
+{ + bool ret = false; + switch (endDataType) + { + case endDataNull: + assert(false); + break; + case endDataEpoch: + ret = m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]; + break; + case endDataSet: + ret = !EnsureDataAvailable(m_mbStartSample); + break; + case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true + ret = SentenceEnd(); + break; + } + return ret; +} + + +template +bool SequenceReader::SentenceEnd() +{ + // this is after getMinibatch size, which has increased m_seqIndex by 1 + // so the real index is m_seqIndex - 1; + int seqIndex = (int)m_seqIndex - 1; + + // now get the labels + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + size_t actualmbsize = 0; + + // figure out the size of the next sequence + if (seqIndex > 0) + { + actualmbsize = m_sequence[seqIndex] - m_sequence[seqIndex-1]; + } + else + { + actualmbsize = m_sequence[0]; + } + + if (actualmbsize < m_mbSize) + return true; + + size_t jEnd = m_sequence[seqIndex]-1; + + if (labelInfo.type == labelCategory) + { + LabelIdType index ; + if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) + RuntimeError("cannot find sentence begining label"); + + if (m_labelIdData[jEnd] == index ) + return true; + else + return false; + } + return false; +} + +template +void SequenceReader::GetLabelOutput(std::map*>& matrices, + size_t m_mbStartSample, size_t actualmbsize) +{ + size_t j = 0; + Matrix* labels = matrices[m_labelsName[labelInfoOut]]; + if (labels == nullptr) return; + + labels->Resize(nwords + class_size, actualmbsize, false); + + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + + int wrd = m_labelIdData[jRand]; + int clsidx = idx4class[wrd]; + + labels->SetValue(wrd, j, 1); + + if (class_size > 0) + labels->SetValue(nwords + clsidx, j, 1); + } + +} + +template +void SequenceReader::GetInputToClass(std::map*>& matrices) +{ + Matrix* idx2cls= matrices[STRIDX2CLS]; + if (idx2cls== nullptr) return; + + if (m_idx2clsRead) return; + + // populate local CPU matrix + m_id2classLocal->SwitchToMatrixType(MatrixType::DENSE); + m_id2classLocal->Resize(nwords , 1, false); + + //move to CPU since element-wise operation is expensive and can go wrong in GPU + int curDevId = m_id2classLocal->GetDeviceId(); + m_id2classLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + for (size_t j = 0; j < nwords ; j++) + { + int clsidx = idx4class[(int)j]; + (*m_id2classLocal)(j,0) = (float)clsidx; + } + m_id2classLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + + int oldDeviceId = idx2cls->GetDeviceId(); + // caution, SetValue changes idx2cls from GPU to CPU, may change this behavior later + idx2cls->SetValue(*m_id2classLocal); + idx2cls->TransferFromDeviceToDevice(idx2cls->GetDeviceId(), oldDeviceId, true); + + m_idx2clsRead = true; +} + +template +void SequenceReader::GetClassInfo(std::map*>& matrices) +{ + Matrix* clsinfo = matrices[CLASSINFO]; + if (clsinfo == nullptr) return; + + if (m_clsinfoRead) return; + + // populate local CPU matrix + m_classInfoLocal->SwitchToMatrixType(MatrixType::DENSE); + m_classInfoLocal->Resize(2, class_size); + + //move to CPU since element-wise operation is expensive and can go wrong in GPU + int curDevId = m_classInfoLocal->GetDeviceId(); + 
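        
The loop that follows (after the transfer to CPU) fills m_classInfoLocal, a 2 x class_size
matrix in which row 0 holds the id of the first word of each class and row 1 holds one past
the last, assuming word ids are grouped by class id. These ranges are what let the
class-based output layer factor P(w|h) as P(class(w)|h) * P(w|h, class(w)) and normalize
over one class instead of the whole vocabulary. A standalone sketch of the same boundary
computation; computeClassRanges and its types are illustrative, not reader code:

#include <iostream>
#include <utility>
#include <vector>

// Given per-word class ids in nondecreasing order, return for each class the
// half-open range [begin, end) of word ids it covers -- the same layout the
// reader stores in rows 0 and 1 of m_classInfoLocal.
static std::vector<std::pair<int, int>> computeClassRanges(const std::vector<int>& wordClass,
                                                           int numClasses)
{
    std::vector<std::pair<int, int>> ranges(numClasses, {0, 0});
    int prevCls = -1;
    for (int w = 0; w < (int)wordClass.size(); ++w)
    {
        if (wordClass[w] != prevCls)
        {
            if (prevCls >= 0)
                ranges[prevCls].second = w;  // close the previous class
            prevCls = wordClass[w];
            ranges[prevCls].first = w;       // open the new class
        }
    }
    if (prevCls >= 0)
        ranges[prevCls].second = (int)wordClass.size();
    return ranges;
}

int main()
{
    // 6 words in 3 classes: words 0-1 -> class 0, words 2-4 -> class 1, word 5 -> class 2
    std::vector<int> wordClass = {0, 0, 1, 1, 1, 2};
    for (const auto& r : computeClassRanges(wordClass, 3))
        std::cout << "[" << r.first << "," << r.second << ")\n";  // [0,2) [2,5) [5,6)
    return 0;
}
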
m_classInfoLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + + int clsidx; + int prvcls = -1; + for (size_t j = 0; j < nwords; j++) + { + clsidx = idx4class[(int)j]; + if (prvcls != clsidx) + { + if (prvcls >= 0) + (*m_classInfoLocal)(1, prvcls) = (float)j; + prvcls = clsidx; + (*m_classInfoLocal)(0, prvcls) = (float)j; + } + } + (*m_classInfoLocal)(1, prvcls) = (float)nwords; + + m_classInfoLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + + int oldDeviceId = clsinfo->GetDeviceId(); + // caution, SetValue changes m_classInfoLocal from GPU to CPU, may change this behavior later + clsinfo->SetValue(*m_classInfoLocal); + clsinfo->TransferFromDeviceToDevice(clsinfo->GetDeviceId(), oldDeviceId, true); + + m_clsinfoRead = true; +} + +template +bool SequenceReader::GetMinibatch(std::map*>& matrices) +{ + + // get out if they didn't call StartMinibatchLoop() first + if (m_mbSize == 0) + return false; + + // check to see if we have changed epochs, if so we are done with this one. + if (m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]) + return false; + + bool moreData = EnsureDataAvailable(m_mbStartSample); + if (moreData == false) + return false; + + // figure which sweep of the randomization we are on + size_t recordStart = m_totalSamples?m_mbStartSample%m_totalSamples:m_mbStartSample; + + // actual size is the size of the next seqence + size_t actualmbsize = 0; + + // figure out the size of the next sequence + if (m_seqIndex > 0 && m_seqIndex < m_sequence.size() && m_sequence.size() > 1) + { + actualmbsize = m_sequence[m_seqIndex] - m_sequence[m_seqIndex-1]; + } + else + { + actualmbsize = m_sequence[0]; + } + + if (actualmbsize > m_mbSize){ + RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. 
memory can crash!", m_mbSize, actualmbsize); + } + + // hit the end of the dataset, + if (!moreData) + { + // make sure we take into account hitting the end of the dataset (not wrapping around) + actualmbsize = min(m_totalSamples-recordStart,actualmbsize); + } + + // now get the labels + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + if (labelInfo.type == labelCategory) + { + memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*actualmbsize); + memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*actualmbsize); + } + else if (labelInfo.type != labelNone) + { + memset(m_labelsBuffer,0,sizeof(ElemType)*1*actualmbsize); + } + + if (actualmbsize > 0) + { + + memset(m_featuresBuffer, 0, sizeof(ElemType)*actualmbsize*labelInfo.dim); + + //loop through all the samples + int j = 0; + Matrix& features = *matrices[m_featuresName]; + if (matrices.find(m_featuresName) != matrices.end()) + { + if(features.GetMatrixType() == MatrixType::DENSE) + { + features.Resize(labelInfo.dim, actualmbsize, false); + features.SetValue(0); + } + else + { + features.Resize(labelInfo.dim, actualmbsize); + features.Reset(); + } + } + + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + + // vector of feature data goes into matrix column + size_t idx = (size_t)m_featureData[jRand]; + m_featuresBuffer[j*labelInfo.dim + idx] = (ElemType)1; + + if (matrices.find(m_featuresName) != matrices.end()) + features.SetValue(idx, j, (ElemType)1); + } + + GetLabelOutput(matrices, m_mbStartSample, actualmbsize); + GetInputToClass(matrices); + GetClassInfo(matrices); + + // make sure that the sequence index matches our end index + assert(m_sequence[m_seqIndex] == m_mbStartSample+actualmbsize); + // go to the next sequence + m_seqIndex++; + } + + // advance to the next minibatch + m_mbStartSample += actualmbsize; + + // if they don't want partial minibatches, skip data transfer and return + if (actualmbsize == 0) // no records found (end of minibatch) + { + return false; + } + + // now transfer to the GPU as needed + try{ + // get the features array + if (matrices.find(m_featuresName) == matrices.end()) + { + Matrix& nbs = *matrices[L"numberobs"]; + int curDevId = nbs.GetDeviceId(); + nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + nbs(0,0) = (float)actualmbsize; + nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + for (size_t i = 0; i < actualmbsize; i++) + { + std::wstring ws = msra::strfun::wstrprintf (L"feature%d", i); + Matrix& features = *matrices[ws]; + features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); + } + } + }catch(...) + { + RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); + } + + try + { + if (labelInfo.type == labelCategory) + { + if (matrices.find(m_labelsName[labelInfoOut]) == matrices.end()) + { + for (size_t i = 0; i < actualmbsize; i++) + { + std::wstring ws = msra::strfun::wstrprintf (L"label%d", i); + Matrix* labels = matrices[ws]; + labels->SetValue(labelInfo.dim, 1, &m_labelsBuffer[i * labelInfo.dim],matrixFlagNormal); + } + } + } + else if (labelInfo.type != labelNone) + { + Matrix* labels = matrices[m_labelsName[labelInfoOut]]; + labels->SetValue(1, actualmbsize,m_labelsBuffer,matrixFlagNormal); + } + }catch(...) 
+ { + RuntimeError("cannot find matrices for %s", m_labelsName[labelInfoOut]); + } + + // we read some records, so process them + return true; +} + +template +void SequenceReader::OrganizeClass() +{ + //allocate auxiliary class variables (for faster search when normalizing probability at output layer) + int cl, i; + for (i=0; i +const std::map::LabelIdType, typename IDataReader::LabelType>& SequenceReader::GetLabelMapping(const std::wstring& sectionName) +{ + if (m_cachingReader) + { + return m_cachingReader->GetLabelMapping(sectionName); + } + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + return labelInfo.mapIdToLabel; +} + +// SetLabelMapping - Sets the label mapping from integer index to label +// labelMapping - mapping table from label values to IDs (must be 0-n) +// note: for tasks with labels, the mapping table must be the same between a training run and a testing run +template +void SequenceReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename LabelType>& labelMapping) +{ + if (m_cachingReader) + { + RuntimeError("Cannot set mapping table when the caching reader is being used"); + } + LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + labelInfo.mapIdToLabel = labelMapping; + labelInfo.mapLabelToId.clear(); + for (std::pair var : labelMapping) + { + labelInfo.mapLabelToId[var.second] = var.first; + } +} + +// GetData - Gets metadata from the specified section (into CPU memory) +// sectionName - section name to retrieve data from +// numRecords - number of records to read +// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request +// dataBufferSize - [in] size of the databuffer in bytes +// [out] size of buffer filled with data +// recordStart - record to start reading from, defaults to zero (start of data) +// returns: true if data remains to be read, false if the end of data was reached +template +bool SequenceReader::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) +{ + if (!m_cachingReader) + RuntimeError("GetData not supported in SequenceReader"); + return m_cachingReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart); +} + +// instantiate all the combinations we expect to be used +template class SequenceReader; +template class SequenceReader; + +template +void BatchSequenceReader::Init(const ConfigParameters& readerConfig) +{ + // See if the user wants caching + m_cachingReader = NULL; + m_cachingWriter = NULL; + + // NOTE: probably want to re-enable at some point + + // initialize the cache + //InitCache(readerConfig); + //m_readerConfig = readerConfig; + + //// if we have a cache, no need to parse the test files... 
+ //if (m_cachingReader) + // return; + + std::vector features; + std::vector labels; + GetFileConfigNames(readerConfig, features, labels); + if (features.size() > 0) + { + m_featuresName = features[0]; + } + + if (labels.size() == 2) + { + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelsName[index] = labels[index]; + } + } + else + RuntimeError("two label definitions (in and out) required for Sequence Reader"); + + ConfigParameters featureConfig = readerConfig(m_featuresName,""); + ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; + + class_size = 0; + m_featureDim = featureConfig("dim"); + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelInfo[index].idMax = 0; + m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); + m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); + + // determine label type desired + std::string labelType(labelConfig[index]("labelType","Category")); + if (labelType == "Category") + { + m_labelInfo[index].type = labelCategory; + } + else if (labelType == "NextWord") + { + // in this case, it's all identical to the Input labels, except the data type + m_labelInfo[index].type = labelNextWord; + m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; + } + else if (labelType == "None") + { + m_labelInfo[index].type = labelNone; + m_labelInfo[index].dim = 0; // override for no labels + } + + // if we have labels, we need a label Mapping file, it will be a file with one label per line + if (m_labelInfo[index].type != labelNone) + { + std::wstring wClassFile = readerConfig("wordclass", ""); + nwords = labelConfig[index]("labelDim"); + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + } + + std::vector arrayLabels; + std::wstring labelPath = labelConfig[index]("labelMappingFile"); + if (fexists(labelPath)) + { + LoadLabelFile(labelPath, arrayLabels); + for (int i=0; i < arrayLabels.size(); ++i) + { + LabelType label = arrayLabels[i]; + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); + m_labelInfo[index].mapName = labelPath; + } + else + { + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + int iMax = -1, i; + for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) + { + LabelType label = ptr->first; + i = ptr->second; + iMax = max(i, iMax); + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)(iMax+1); + + OrganizeClass(); + + } + m_labelInfo[index].mapName = labelPath; + + m_labelInfo[index].fileToWrite = labelPath; + } + } + + m_labelInfo[index].dim = labelConfig[index]("labelDim"); + + // update dimension if the file says it's bigger + if (m_labelInfo[index].dim < m_labelInfo[index].idMax) + { + m_labelInfo[index].dim = m_labelInfo[index].idMax; + } + } + + // initialize all the variables + m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; + m_endReached = false; + m_readNextSampleLine = 0; + m_readNextSample = 0; + m_traceLevel = readerConfig("traceLevel","0"); + m_parser.SetTraceLevel(m_traceLevel); + + if (readerConfig.Exists("randomize")) + { + string randomizeString = readerConfig("randomize"); + if (randomizeString == "None") + { + ; + } + else if (randomizeString == "Auto") + { + ; + } + else + { + ;//readerConfig("randomize"); + } + } + else + { + ; 
//randomizeAuto; + } + + // The input data is a combination of the label Data and extra feature dims together +// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; + m_featureCount = 1; + + std::wstring m_file = readerConfig("file"); + if (m_traceLevel > 0) + fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); + + const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; + m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); + + mBlgSize = readerConfig("nbruttsineachrecurrentiter", "1"); +} + +template +void BatchSequenceReader::Reset() +{ + mProcessed.clear(); + mToProcess.clear(); + mLastProcssedSentenceId = 0; + mPosInSentence = 0; + mLastPosInSentence = 0; + mNumRead = 0; + + if (m_labelTemp.size() > 0) + m_labelTemp.clear(); + if (m_featureTemp.size() > 0) + m_featureTemp.clear(); + m_parser.mSentenceIndex2SentenceInfo.clear(); +} + +template +void BatchSequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) +{ + // if we aren't currently caching, see if we can use a cache + if (!m_cachingReader && !m_cachingWriter) + { + InitCache(m_readerConfig); + if (m_cachingReader) + ReleaseMemory(); // free the memory used by the SequenceReader + } + + // if we are reading from the cache, do so now and return + if (m_cachingReader) + { + m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); + return; + } + + if (m_featuresBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; + memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); + } + + if (m_labelsBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + if (labelInfo.type == labelCategory) + { + m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); + m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; + memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); + } + else if (labelInfo.type != labelNone) + { + m_labelsBuffer = new ElemType[mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); + m_labelsIdBuffer = NULL; + } + } + + m_featuresBufferRow = new size_t[mbSize]; + m_featuresBufferRowIdx = new size_t[mbSize]; + + m_labelsIdBufferRow = new CPUSPARSE_INDEX_TYPE[2 * mbSize]; + m_labelsBlock2Id = new size_t[2*mbSize]; + m_labelsBlock2UniqId = new size_t[2*mbSize]; + + m_id2classLocal = new Matrix(CPUDEVICE); + m_classInfoLocal = new Matrix(CPUDEVICE); + + m_mbSize = mbSize; + if (requestedEpochSamples == requestDataSize) + { + if (!m_endReached) + { + m_epochSize = requestDataSize; + } + } + else + { + m_epochSize = requestedEpochSamples; + } + + // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set + size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; + m_epoch = epoch; + m_mbStartSample = epoch*m_epochSize; + + // allocate room for the data + m_featureData.reserve(m_featureCount*epochSize); + if (m_labelInfo[labelInfoOut].type == labelCategory) + m_labelIdData.reserve(epochSize); + else if (m_labelInfo[labelInfoOut].type != labelNone) + m_labelData.reserve(epochSize); + m_sequence.reserve(m_seqIndex); // clear 
out the sequence array + /// this is too complicated for LM + // SetupEpoch(); + /// use the LMSetupEpoch() instead + LMSetupEpoch(); + + m_clsinfoRead = false; + m_idx2clsRead = false; + + m_parser.ParseReset(); + + Reset(); +} + +template +size_t BatchSequenceReader::FindNextSentences(size_t numRead) +{ + size_t sln = 0; + + if (numRead == 0) return 0; + + if (mProcessed.size() == 0) + { + mProcessed.resize(numRead, false); + } + + if (mToProcess.size() > 0) + { + bool allDone = false; + for (int s = 0; s < mToProcess.size(); s++) + { + int mp = (int)mToProcess[s]; + if (mProcessed[mp]) + { + mLastProcssedSentenceId = mp; + mLastPosInSentence = 0; + allDone = true; + break; + } + } + if (allDone) + { + mToProcess.clear(); + } + } + + if (mToProcess.size() > 0) + { + sln = m_parser.mSentenceIndex2SentenceInfo[mToProcess[0]].sLen; + return sln; + } + + for (size_t seq = mLastProcssedSentenceId ; seq < numRead; seq++) + { + if (mProcessed[seq]) continue; + + if (sln == 0) + { + sln = m_parser.mSentenceIndex2SentenceInfo[seq].sLen; + } + if (sln == m_parser.mSentenceIndex2SentenceInfo[seq].sLen && + mProcessed[seq] == false && mToProcess.size() < mBlgSize) + mToProcess.push_back(seq); + + if (mToProcess.size() == mBlgSize) break; + } + + return sln; +} + +template +bool BatchSequenceReader::EnsureDataAvailable(size_t /*mbStartSample*/) +{ + bool bDataIsThere = true; + + m_featureData.clear(); + m_labelIdData.clear(); + + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + + bool nextWord = false; + if (m_labelInfo[labelInfoOut].type == labelNextWord) + { + nextWord = true; + } + LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; + + // see how many we already read + std::vector seqPos; + + size_t sLn = FindNextSentences(mNumRead); + if (sLn == 0) + { + Reset(); + + mNumRead = m_parser.Parse(CACHE_BLOG_SIZE, &m_labelTemp, &m_featureTemp, &seqPos); + if (mNumRead == 0) return false; + + std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end()); + + m_readNextSampleLine += mNumRead; + sLn = FindNextSentences(mNumRead); + } + + /// add one minibatch + size_t i = mLastPosInSentence; + size_t j = 0; + // exclude the last token since it is the last label to be predicted + for (i = mLastPosInSentence; j < m_mbSize && i < sLn-1; i++ , j++) + { + for (int k = 0; k < mToProcess.size(); k++) + { + size_t seq = mToProcess[k]; + size_t label = m_parser.mSentenceIndex2SentenceInfo[seq].sBegin + i; + + // labelIn should be a category label + LabelType labelValue = m_labelTemp[label++]; + + // to-do, should ignore , check the sentence ending is + // need to remove from the training set + // allocate and initialize the next chunck of featureData + if (labelIn.type == labelCategory) + { + LabelIdType index = GetIdFromLabel(labelValue, labelIn); + + // use the found value, and set the appropriate location to a 1.0 + assert(labelIn.dim > index); // if this goes off labelOut dimension is too small + m_featureData.push_back((float)index); + } + else + { + RuntimeError("Input label expected to be a category label"); + } + + // now get the output label + if (m_labelInfo[labelInfoOut].type == labelCategory) + { + labelValue = m_labelTemp[label++]; + } + else if (nextWord) + { + // this is the next word (label was incremented above) + labelValue = m_labelTemp[label]; + if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) + { + labelValue = labelInfo.endSequence; + } + } + else + { + 
RuntimeError("Invalid output label type, expected Category, or Next Word"); + } + + // get the ID from the label + LabelIdType id = GetIdFromLabel(labelValue, labelInfo); + m_labelIdData.push_back(id); + + m_totalSamples ++; + } + } + + mLastPosInSentence = i; + + return bDataIsThere; +} + +template +size_t BatchSequenceReader::NumberSlicesInEachRecurrentIter() +{ + size_t sz = mToProcess.size(); + return sz; +} + +template +void BatchSequenceReader::SetNbrSlicesEachRecurrentIter(const size_t mz) +{ + mBlgSize = mz; +} + +template +bool BatchSequenceReader::GetMinibatch(std::map*>& matrices) +{ + + // get out if they didn't call StartMinibatchLoop() first + if (m_mbSize == 0) + return false; + + bool moreData = EnsureDataAvailable(m_mbStartSample); + if (moreData == false) + return false; + + // actual size is the size of the next seqence + size_t actualmbsize = 0; + + // figure out the size of the next sequence + actualmbsize = m_labelIdData.size() ; + if (actualmbsize > m_mbSize * mToProcess.size()){ + RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. memory can crash!", m_mbSize, actualmbsize); + } + + // now get the labels + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + if (actualmbsize > 0) + { + + //loop through all the samples + Matrix& features = *matrices[m_featuresName]; + + // copy m_featureData to matrix + // we always copy it to cpu first and then convert to gpu if gpu is desired. + DEVICEID_TYPE featureDeviceId = features.GetDeviceId(); + features.TransferFromDeviceToDevice(featureDeviceId, CPUDEVICE, false, true, false); + + if (features.GetMatrixType() == MatrixType::DENSE) + { + features.Resize(labelInfo.dim, actualmbsize); + features.SetValue(0); + } + else + { + features.Resize(labelInfo.dim, actualmbsize, actualmbsize); + features.Reset(); + } + + for (size_t j = 0; j < actualmbsize; ++j) + { + // vector of feature data goes into matrix column + size_t idx = (size_t)m_featureData[j]; + + //if (matrices.find(m_featuresName) != matrices.end()) + features.SetValue(idx, j, (ElemType)1); + } + + features.TransferFromDeviceToDevice(CPUDEVICE, featureDeviceId, false,false, false); + + //else // for GPU + //{ + // if (matrices.find(m_featuresName) != matrices.end()) + // { + // m_indexer.clear(); + // size_t size = m_featureData.size(); + + // for(int i = 0; i < size; i++) + // { + // m_featuresBufferRow[i] = (size_t)m_featureData[i]; + // if(m_indexer.find(m_featuresBufferRow[i]) == m_indexer.end()) + // { + // m_indexer[m_featuresBufferRow[i]] = m_indexer.size(); + // } + // m_featuresBufferRowIdx[i] = m_indexer[m_featuresBufferRow[i]]; + // } + // features.SetMatrixFromCSCFormat(m_featuresBufferRow, m_featuresBufferRowIdx, size, m_indexer.size()); + // } + //} + + // TODO: move these two methods to startMiniBatchLoop() + GetInputToClass(matrices); + GetClassInfo(matrices); + GetLabelOutput(matrices, 0, actualmbsize); + + // go to the next sequence + m_seqIndex++; + } + else + return false; + + // now transfer to the GPU as needed + try{ + // get the features array + if (matrices.find(m_featuresName) == matrices.end()) + { + Matrix& nbs = *matrices[L"numberobs"]; + int curDevId = nbs.GetDeviceId(); + nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + nbs(0,0) = (float)actualmbsize; + nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + for (size_t i = 0; i < actualmbsize; i++) + { + std::wstring ws = 
msra::strfun::wstrprintf (L"feature%d", i); + Matrix& features = *matrices[ws]; + features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); + } + } + }catch(...) + { + RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); + } + + // we read some records, so process them + return true; +} + +template +void BatchSequenceReader::SetSentenceEnd(int wrd, int pos, int actualMbSize) +{ + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + LabelIdType index = GetIdFromLabel(labelIn.endSequence.c_str(), labelIn); + + if (pos == actualMbSize - 1) + { + if (wrd == (int)index) + mSentenceEnd = true; + else + mSentenceEnd = false; + } +} + +template +void BatchSequenceReader::SetSentenceBegin(int wrd, int pos, int /*actualMbSize*/) +{ + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + LabelIdType index = GetIdFromLabel(labelIn.beginSequence.c_str(), labelIn); + + if (pos == 0) + { + if (wrd == (int)index) + mSentenceBegin = true; + else + mSentenceBegin = false; + } +} + +template +void BatchSequenceReader::SetSentenceEndInBatch(vector &sentenceEnd) +{ + sentenceEnd.resize(mToProcess.size()); + if (mSentenceBegin) + { + sentenceEnd.assign(mToProcess.size(), 0); + } + else + { + sentenceEnd.assign(mToProcess.size(), m_mbSize+2); + } +} + +template +bool BatchSequenceReader::DataEnd(EndDataType endDataType) +{ + bool ret = false; + switch (endDataType) + { + case endDataNull: + assert(false); + break; + case endDataEpoch: + case endDataSet: + ret = !EnsureDataAvailable(m_mbStartSample); + break; + case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true + if (mSentenceEnd) + { + for (auto ptr = mToProcess.begin(); ptr != mToProcess.end(); ptr++) + mProcessed[*ptr] = true; + } + ret = mSentenceEnd; + break; + } + return ret; + +} + +template +void BatchSequenceReader::GetLabelOutput(std::map*>& matrices, + size_t m_mbStartSample, size_t actualmbsize) +{ + size_t j = 0; + Matrix* labels = matrices[m_labelsName[labelInfoOut]]; + if (labels == nullptr) return; + + if(labels->GetMatrixType() == MatrixType::DENSE) + { + labels->Resize(nwords + class_size, actualmbsize, false); + labels->SetValue(0); + } + else + { + labels->Resize(nwords + class_size, actualmbsize, 2*actualmbsize); + labels->Reset(); + } + + if(labels->GetCurrentMatrixLocation() == CPU) { + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + + int wrd = m_labelIdData[jRand]; + int clsidx = idx4class[wrd]; + + labels->SetValue(wrd, j, 1); + + SetSentenceEnd(wrd, j, actualmbsize); + SetSentenceBegin(wrd, j, actualmbsize); + + if (class_size > 0) + labels->SetValue(nwords + clsidx, j, 1); + } + } + else // GPU + { + m_indexer.clear(); + int p = 0; + int b = 0; + int nz = 0; + + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + int wrd = m_labelIdData[jRand]; + int clsidx = idx4class[wrd]; + SetSentenceEnd(wrd, j, actualmbsize); + SetSentenceBegin(wrd, j, actualmbsize); + + int start[2]; + int end[2]; + int target[2]; + int blockId[2]; + + start[0] = (int)(*m_classInfoLocal)(0, clsidx); + end[0] = (int)(*m_classInfoLocal)(1, clsidx); + target[0] = wrd; + blockId[0] = clsidx; + start[1] = nwords; + end[1] = nwords 
+ (int)(*m_classInfoLocal).GetNumCols(); + target[1] = nwords + clsidx; + blockId[1] = -1; + + for(int i = 0; i < 2; i++) + { + m_labelsIdBufferRow[p] = target[i]; + int len = end[i] - start[i]; + + if(m_indexer.find(blockId[i]) == m_indexer.end()) + { + m_indexer[blockId[i]] = b; + b += len; + } + m_labelsBlock2Id[p] = nz; + m_labelsBlock2UniqId[p] = m_indexer[blockId[i]]; + nz += len; + p++; + } + } + + labels->SetMatrixFromLabelAndClass(m_labelsIdBufferRow, m_labelsBlock2Id, m_labelsBlock2UniqId, 2*actualmbsize, nz, b); + } +} + +template class BatchSequenceReader; +template class BatchSequenceReader; + }}} \ No newline at end of file diff --git a/MachineLearning/CNTKEval/CNTKEval.vcxproj b/MachineLearning/CNTKEval/CNTKEval.vcxproj index f1dfe619a..2a981d6d6 100644 --- a/MachineLearning/CNTKEval/CNTKEval.vcxproj +++ b/MachineLearning/CNTKEval/CNTKEval.vcxproj @@ -60,7 +60,7 @@ - Use + NotUsing Level4 Disabled EVALDLL;WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions) @@ -79,7 +79,7 @@ Level4 - Use + NotUsing MaxSpeed true true @@ -107,6 +107,7 @@ + @@ -127,6 +128,7 @@ NotUsing + NotUsing diff --git a/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters b/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters index f19953971..2c505bfdf 100644 --- a/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters +++ b/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters @@ -1,50 +1,56 @@ - - - - - - - - - - - Common - - - Common - - - Common - - - Common - - - - - - - - - - Common\Include - - - Common\Include - - - Common\Include - - - Common\Include - - - - - {bed53b47-70b1-494c-824d-0748362003b2} - - - {f3bf0104-8a08-40c9-a4d9-af8411c49669} - - + + + + + + + + + + + Common + + + Common + + + Common + + + Common + + + Common + + + + + + + + + + Common\Include + + + Common\Include + + + Common\Include + + + Common\Include + + + Common\Include + + + + + {bed53b47-70b1-494c-824d-0748362003b2} + + + {f3bf0104-8a08-40c9-a4d9-af8411c49669} + + \ No newline at end of file diff --git a/MachineLearning/cn/ComputationNode.h b/MachineLearning/cn/ComputationNode.h index 7ec090964..1012db450 100644 --- a/MachineLearning/cn/ComputationNode.h +++ b/MachineLearning/cn/ComputationNode.h @@ -3111,6 +3111,9 @@ protected: \ inputGradientValues.Print("child Gradient-in/out"); inputFunctionValues.Print("child Function values"); #endif + //currently we only support one combination when the input is sparse. + if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE) + inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol); Matrix::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues); #if DUMPOUTPUT diff --git a/MachineLearning/cn/NetworkDescriptionLanguage.h b/MachineLearning/cn/NetworkDescriptionLanguage.h index e2e2fdfb3..cf158d785 100644 --- a/MachineLearning/cn/NetworkDescriptionLanguage.h +++ b/MachineLearning/cn/NetworkDescriptionLanguage.h @@ -1,1067 +1,1072 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// -#pragma once -#include "commandArgUtil.h" -#include "ComputationNode.h" -#include "TrainingCriterionNode.h" -#include "CompositeComputationNode.h" -#include "EvaluationCriterionNode.h" -#include "ComputationNetwork.h" -#include - -namespace Microsoft { namespace MSR { namespace CNTK { - -// EqualInsensitive - check to see if two nodes are equal up to the length of the first string (must be at least half as long as actual node name) -// string1 - [in,out] string to compare, if comparision is equal insensitive but not sensitive, will replace with sensitive version -// string2 - second string to compare -// alternate - alternate naming of the string -// return - true if strings are equal insensitive and modifies string1 to sensitive version if different -bool EqualInsensitive(std::wstring& string1, const std::wstring& string2, const wchar_t* alternate=NULL); - -// CheckFunction - check to see if we match a function name -// string1 - [in,out] string to compare, if comparision is equal and at least half the full node name will replace with full node name -// allowUndeterminedVariable - [out] set to true if undetermined variables (symbols yet to be defined) are allowed here -// return - true if function name found -template -bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable=nullptr); - -// NDLType - Network Description Language node type -enum NDLType -{ - ndlTypeNull, - ndlTypeConstant, - ndlTypeFunction, - ndlTypeVariable, - ndlTypeParameter, // parameter value, must be looked up to get actual value - ndlTypeUndetermined, // an undetermined value that will later be resolved - ndlTypeOptionalParameter, - ndlTypeArray, - ndlTypeMacroCall, // calling a macro - ndlTypeMacro, // definition of a macro - ndlTypeMax -}; - -// NDLPass - enumeration for the number of passes through the NDL parser -enum NDLPass -{ - ndlPassInitial, // inital pass, create nodes - ndlPassResolve, // resolve any undetermined symbols (variables that were not yet declared in NDL) - ndlPassFinal, // final pass done post-validation (when all matrices are allocated to the correct size) - ndlPassAll = ndlPassFinal, // all passes, used as flag in NDLUtil.h - ndlPassMax // number of NDLPasses -}; - -// ++ operator for this enum, so loops work -NDLPass &operator++(NDLPass &ndlPass); - -// Predeclaration of Script and Node -template -class NDLScript; - -template -class NDLNode; - -// NDLNodeEvaluator - Node evaluaton interface -// implemented by execution engines to convert script to approriate internal formats -template -class NDLNodeEvaluator -{ -public: - virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) = 0; - virtual ~NDLNodeEvaluator() = 0; - - // EvaluateParameter - Evaluate a parameter of a call - // node - NDLNode of the script - // nodeParam - NDLNode parameter we are evaluating - // baseName - name of the base node - // pass - which pass through the NDL nodes - // returns: the node that is the evaluated parameter - virtual NDLNode* EvaluateParameter(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseName, const NDLPass pass ) = 0; - - // EvaluateParameters - Evaluate the parameters of a call - // node - NDLNode we are evaluating paramters for - // baseName - baseName for the current node - // nodeParamStart - starting parameter that contains a node - // nodeParamCount - ending parameter that contains a node - // pass - NDL pass we are evaluating - // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator - virtual 
std::vector EvaluateParameters(NDLNode* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass) = 0; - - // FindSymbol - Search the engines symbol table for a fully quantified symbol - // symbol - name of the symbol - // returns - pointer to the matching EvalValue for that node, of NULL if not found - virtual void* FindSymbol(const wstring& /*symbol*/) - { - return NULL; - } - // ProcessOptionalParameters - Process the optional parameters of a node - // node to process - virtual void ProcessOptionalParameters(NDLNode* /*node*/) - { - return; - } - -}; - -template class NDLNodeEvaluator; -template class NDLNodeEvaluator; - -template -class NetNdl // class to associate a network with an NDLScript -{ -public: - ComputationNetwork* cn; - NDLScript* ndl; // NDLScript we are using for this network. NOTE: the actual script used - NDLNode* lastNode[ndlPassMax]; // last node we evaluated for each pass - NetNdl(): cn(nullptr), ndl(nullptr) {ClearLastNodes();} - NetNdl(ComputationNetwork*p_cn): cn(p_cn), ndl(nullptr) {ClearLastNodes();} - NetNdl(ComputationNetwork*p_cn, NDLScript* p_ndl): cn(p_cn), ndl(p_ndl) {ClearLastNodes();} - ~NetNdl() - {} - - // ClearLastNodes - Clear out the last node values for all passes - void ClearLastNodes() - { - for (NDLPass pass=ndlPassInitial;pass < ndlPassMax;++pass) - { - lastNode[pass] = nullptr; - } - } - - // Clear - clear out everything in the structure - // NOTE: this deletes the network and the NDLScript, use with care! - void Clear() - { - delete cn; - delete ndl; - cn = nullptr; - ndl = nullptr; - ClearLastNodes(); - } -}; - -template -inline NDLNodeEvaluator::~NDLNodeEvaluator() { } // defined even though it's virtual; supposed to be faster this way - -// NDLNode - Network Description Language Node -// Used to represent a named entity in the NDL -// if a name is not provided (such as in nesting scenarios) one will be generated -template -class NDLNode -{ -private: - std::string m_name; // value on the left of the equals - ConfigValue m_value; // value on the right of the equals (CN node name, or value) - NDLScript* m_parent; // parent script - NDLType m_type; //type of node - ConfigArray m_paramString; // parameter of a function/array - ConfigArray m_paramMacro; // parameter of a macro (the variables used in the macro definition) - vector m_parameters; // parameters as nodes/array elements - void *m_eval; // pointer to an arbitrary eval structure - NDLScript* m_script; // script for ndlTypeMacro - static int s_nameCounter; // counter for generating unique names -public: - NDLNode(const std::string& name, ConfigValue value, NDLScript* parent, NDLType ndlType) - { - if (name.empty()) - GenerateName(); - else - m_name = name; - m_value = value; - m_parent = parent; - assert(parent != NULL); - parent->AddChild(this); - m_type = ndlType; - m_eval = NULL; - m_script = NULL; - } - - ~NDLNode() - {} - - // publicly accessible Copy method - // should only be used for macro expansion - NDLNode* Copy() const - { - NDLNode* ret = new NDLNode(*this); - return ret; - } - -private: - - // copy constructor, creates a new disconnected copy of this node for macro expansion - NDLNode(const NDLNode& copyMe); - - NDLNode& operator=(NDLNode& /*copyMe*/) //this is just a place holder implementation which is not functioning but prevent callers to use it. 
- { - throw std::logic_error("'NDLNode& operator=(NDLNode& copyMe)' should never be called."); - } - - // generate a generic symbol name for a node - void GenerateName() - { - char buffer[10]; - sprintf(buffer, "%d", ++s_nameCounter); - m_name = std::string("unnamed") + buffer; - } - -public: - void SetScript(NDLScript* script) {m_script = script;} - NDLScript* GetScript() const {return m_script;} - void SetType(NDLType type) {m_type = type;} - NDLType GetType() const {return m_type;} - const std::string& GetName() const {return m_name;} - void SetName(std::string &name) {m_name = name;} - ConfigValue GetValue() const {return m_value;} - void SetValue(std::string &value) {m_value = value;} - - // parameters of a function (ndlTypFunction), or parameters in the call to a macro - void SetParamString(ConfigValue paramString) {m_paramString = paramString;} - ConfigArray GetParamString() const {return m_paramString;} - - // parameters of a macro - void SetParamMacro(ConfigValue paramMacro) {m_paramMacro = paramMacro;} - ConfigArray GetParamMacro() const {return m_paramMacro;} - - void SetParentScript(NDLScript* script) {m_parent = script;} - NDLScript* GetParentScript() { return m_parent; } - - // get parameters, either just optional or just regular - vector GetParameters(bool optional=false) const - { - vector result; - for (NDLNode* param : m_parameters) - { - bool optParam = param->GetType() == ndlTypeOptionalParameter; - if (optParam == optional) - result.push_back(param); - } - return result; - } - - // Get/Set eval values - void* GetEvalValue() const { return m_eval;} - void SetEvalValue(void* evalValue) {m_eval = evalValue;} - - // GetOptionalParameter - Get an optional parameter value - // name - the name to search for in the optional parameters - // deflt - the default value (if not found) - // returns: parameter value if found, or default value otherwise - ConfigValue GetOptionalParameter(const std::string& name, const std::string& deflt) const - { - for (NDLNode* param : m_parameters) - { - bool optParam = param->GetType() == ndlTypeOptionalParameter; - if (optParam && !_stricmp(param->GetName().c_str(), name.c_str())) - { - return param->GetValue(); - } - } - return ConfigValue(deflt); - } - - // FindNode - Find a node of the given name - // name - name to search for - // searchForDotNames - search for NDL symbols traversing call heirarchy - // returns: The node with that name, or NULL if not found - NDLNode* FindNode(const std::string& name, bool searchForDotNames=false) - { - NDLNode* found = m_parent->FindSymbol(name, searchForDotNames); - if (!found) - found = NDLScript::GlobalScript().FindSymbol(name, searchForDotNames); - return found; - } - - // GetScalar - Get a scalar value from a node, may loop through some variables before arriving - // returns: scalar value - ConfigValue GetScalar() - { - NDLNode* node = this; - while (node && (node->GetType() == ndlTypeVariable || node->GetType() == ndlTypeParameter)) - { - NDLNode* nodeLast = node; - node = node->FindNode(node->GetValue(), true /*searchForDotNames*/); - - // if we are still on the same node, that means it was never resolved to anything, an undefined variable - if (nodeLast == node) - { - RuntimeError("undefined Variable, '%s' found, must be declared before first use\n", node->GetName().c_str()); - } - } - if (!node || node->GetType() != ndlTypeConstant) - { - std::string name = node ? 
node->GetName() : GetName(); - RuntimeError("Scalar expected, '%s' must be a constant or variable that resolves to a constant\n", name.c_str()); - } - return node->GetValue(); - } - - void InsertParam(NDLNode* param) {m_parameters.push_back(param);} - - // EvaluateMacro - Evaluate a macro, make the call - // nodeEval - the node evaluator we are using to interpret the script - // baseName - base name for all symbols at this level - // pass - what NDLPass are we in? - // returns: the return node for this macro - NDLNode* EvaluateMacro(NDLNodeEvaluator& nodeEval, const wstring& baseName, const NDLPass pass) - { - if (m_type != ndlTypeMacroCall) - return NULL; - - // make sure the actual parameters and expected parameters match - if (m_parameters.size() < m_paramMacro.size()) - { - RuntimeError("Parameter mismatch, %d parameters provided, %d expected in call to %s\n", - m_parameters.size(),m_paramMacro.size(),m_value.c_str()); - } - - // assign the actual parameters in the script so we can execute it - for (int i=0; i < m_parameters.size(); ++i) - { - NDLNode* nodeParam = m_parameters[i]; - std::string paramName = i < m_paramMacro.size()?m_paramMacro[i]:nodeParam->GetName(); - - // if the node is a parameter then look it up in the symbol table - if (nodeParam->GetType() == ndlTypeParameter) - { - nodeParam = m_parent->FindSymbol(nodeParam->GetName()); - } - // do we want to add optional parameters as symbols, or not? - else if (nodeParam->GetType() == ndlTypeOptionalParameter) - { - if (i < m_paramMacro.size()) - RuntimeError("Parameter mismatch, parameter %d is an optional parameter, but should be a required parameter\n",i); - // if no symbol yet, add it - if (!m_script->ExistsSymbol(paramName)) - { - m_script->AddSymbol(paramName, nodeParam); - continue; - } - //else assign the value below - } - - // assign the parameter symbols in the script we will call with the values passed to the call - m_script->AssignSymbol(paramName, nodeParam); - - } - - std::wstring newBase = baseName; - if (!newBase.empty()) - newBase += L"."; - newBase += msra::strfun::utf16(m_name); - - // now evaluate the contained macro script - NDLNode* nodeResult = m_script->Evaluate(nodeEval, newBase, pass); - // Consider: do we need to restore the original mapping here, may need to for recursive calls? - - // look for a symbol that is identical to the macro name, if it exists this is the return value - NDLNode* nodeMacroName = m_script->FindSymbol(m_value); - if (nodeMacroName) - { - nodeResult = nodeMacroName; - } - - // set the eval node to be the same as the return value; - if (nodeResult) - { - m_eval = nodeResult->GetEvalValue(); - } - return nodeResult; - } -}; - -template -class NDLScript: public ConfigParser -{ -private: - std::wstring m_baseName; - std::string m_scriptString; - std::vector*> m_script; // script lines in parsed node order, macros will have definition followed by body - std::map*, nocase_compare> m_symbols; // symbol table - NDLNode* m_macroNode; // set when interpretting a macro definition - bool m_noDefinitions; // no definitions can be made in this script, interpret all macro/function names as calls - static NDLScript s_global; //("global"); // global script for storing macros and global nodes - std::vector*> m_children; // child nodes. Note that m_script nodes may not be children of this object, they include macro nodes - ComputationNetwork* m_cn; // computation network to use for backup symbol lookup. 
Used for MEL where NDL and network nodes are mixed - bool m_definingMacro; // currently defining a macro, flag to determine if we are defining or interpretting a macro call - -public: - // constructors that take a config name - NDLScript(const std::string & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } - NDLScript(const std::wstring & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } - ~NDLScript() - { - // need to free all the child nodes attached to this script node - for (NDLNode* node : m_children) - { - delete node; - } - m_children.clear(); - } - - // empty constructor - NDLScript() : ConfigParser(';') { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } // parameterless version if needed - - // construct NDLScript from a ConfigValue, propogate the config Name - NDLScript(const ConfigValue& configValue) : ConfigParser(';',configValue.Name()) - { - m_macroNode = NULL; - m_noDefinitions=false; - m_definingMacro = false; - m_scriptString = configValue; - Parse(m_scriptString); - } - - // construct NDLScript from a ConfigValue, propogate the config Name - // configValue - the body of the macro - // oneLineDefinition - this macro definition is all on one line, names optional - // macroName - if the macro has a name, the name - this is used to get parameter info - NDLScript(const ConfigValue& configValue, std::string macroName, bool oneLineDefinition) : ConfigParser(';',configValue.Name()) - { - m_noDefinitions = oneLineDefinition; - m_definingMacro = true; - m_macroNode = NULL; - m_scriptString = configValue; - NDLNode* ndlNode = s_global.CheckName(macroName, true); - if (ndlNode == NULL) - RuntimeError("Invalid macro definition, %s not found", macroName.c_str()); - - // get and parse the parameters - ConfigArray parameters = ndlNode->GetParamMacro(); - for (auto iter = parameters.begin(); iter != parameters.end(); ++iter) - { - // we are adding parameters that will be replaced by actual values later - ConfigValue param = *iter; - - // check to make sure this parameter name is not a reserved word - std::string functionName = param; - // check for function name, a function may have two valid names - // in which case 'functionName' will get the default node name returned - if (CheckFunction(functionName)) - { - RuntimeError("NDLScript: Macro %s includes a parameter %s, which is also the name of a function. Parameter names may not be the same as function names.", macroName.c_str(), param.c_str()); - } - - NDLNode* paramNode = new NDLNode(param, param, this, ndlTypeParameter); - // add to node parameters - ndlNode->InsertParam(paramNode); - // add to script symbol table - AddSymbol(param, paramNode); - } - Parse(m_scriptString); - m_definingMacro = false; - } - - - // copy and move constructors - NDLScript(const NDLScript& copyMe); - NDLScript(const NDLScript&& moveMe); -private: - NDLNode* DuplicateNode(NDLNode* node); -public: - // GlobalScript - Access to global script - static NDLScript& GlobalScript() {return s_global;} - - // SetMacroDefinitionsAllowed - allow macro definitions - // macroAllowed - can macros be defined in this script? 
- void SetMacroDefinitionsAllowed(bool macroAllowed) - { - m_noDefinitions = !macroAllowed; - } - - void SetBaseName(const std::wstring& baseName) - { - m_baseName = baseName; - } - const std::wstring& GetBaseName() - { - return m_baseName; - } - - void ClearGlobal() - { - s_global.Clear(); - } - - void Clear() - { - - for (NDLNode* node : m_children) - { - delete node; - } - m_children.clear(); - for (NDLNode* node : m_script) - { - delete node; - } - m_script.clear(); - - m_symbols.clear(); - } - void ClearEvalValues() - { - for (NDLNode* node : m_children) - { - node->SetEvalValue(NULL); - } - } - // AddChild - add a child node to the script - // node - node to add - // NOTE: this NDLScript owns this node and is responsible to delete it - void AddChild(NDLNode* node) - { - m_children.push_back(node); - } - - // SetComputationNetwork - set the computation network this NDL is associated with - void SetComputationNetwork(ComputationNetwork* cn) - { - m_cn = cn; - } - - // FindSymbol - Find a symbol to the symbol table - // symbol - symbol to find - // searchForDotNames - search for NDL symbols traversing call heirarchy - // returns - node this symbol references - NDLNode* FindSymbol(const std::string& symbol, bool searchForDotNames=true) - { - auto found = m_symbols.find(symbol); //search symbol directly first - if (found != m_symbols.end()) - return found->second; - - // if not found, handle dot names by move up the hierarchy - size_t firstDot = symbol.find_first_of('.'); - if (firstDot == npos) - return nullptr; - - std::string search = symbol.substr(0,firstDot); - found = m_symbols.find(search); - if (found == m_symbols.end()) - { - return NULL; - } - - // handle dot names, - if (firstDot != npos) - { - NDLNode* node = found->second; - NDLScript* script = node->GetScript(); - // if there is no script, probably a parameter/variable with further 'dot' values (ie. var.CE.BFF) - if (script != NULL) - { - if (node->GetType() != ndlTypeMacroCall || script == NULL) - RuntimeError("Symbol name not valid, %s is not a macro, so %s cannot be interpretted",search.c_str(),symbol.c_str() ); - return script->FindSymbol(symbol.substr(firstDot+1), searchForDotNames); - } - } - return found->second; - } - - // ExistsSymbol - Find if a symbol exists (value might be NULL) - // symbol - symbol to find - // returns - true if it's there - bool ExistsSymbol(const std::string& symbol) - { - auto found = m_symbols.find(symbol); - return (found != m_symbols.end()); - } - - // ContainsOptionalParameter - do any nodes in this script have an optional parameter by the following name? 
- // optParamName - name of parameter we are searching for - // returns: vector of the nodes found (empty if nothing found) - vector*> ContainsOptionalParameter(const std::string& optParamName) - { - vector*> result; - std::string empty; - for (auto symbol : m_symbols) - { - NDLNode* node = symbol.second; - std::string value = node->GetOptionalParameter(optParamName, empty); - if (!value.empty()) - { - result.push_back(node); - } - } - return result; - } - - // AddSymbol - Add a symbol to the symbol table - // symbol - symbol to add - // node - node this symbol references - // NOTE: at present we don't allow reuse of a symbol, so this throws an error if it sees an existing symbol - void AddSymbol(const std::string& symbol, NDLNode* node) - { - auto found = m_symbols.find(symbol); - if (found != m_symbols.end()) - { - NDLNode* nodeFound = found->second; - // check for undetermined nodes, because these nodes are to be defined later - if (nodeFound->GetType() != ndlTypeUndetermined && nodeFound->GetType() != ndlTypeParameter) - { - std::string value = found->second->GetValue(); - RuntimeError("Symbol '%s' currently assigned to '%s' reassigning to a different value not allowed\n", symbol.c_str(), value.c_str()); - } - } - m_symbols[symbol] = node; - } - - // AssignSymbol - Assign a new value to a symbol in the table - // symbol - symbol to assign - // node - node this symbol will reference - void AssignSymbol(const std::string& symbol, NDLNode* node) - { - auto found = m_symbols.find(symbol); - if (found == m_symbols.end()) - { - RuntimeError("Symbol '%s' currently does not exist, attempting to assigned value '%s' AssignSymbol() requires existing symbol\n", symbol.c_str(), node->GetValue().c_str()); - } - m_symbols[symbol] = node; - } - - - // FileParse - parse at the file level, can be overridden for "section of file" behavior - // stringParse - file concatentated as a single string - void FileParse(const std::string& stringParse) - { - ConfigParameters sections(stringParse); - bool loadOrRunFound = false; - - // load all the sections that we want (macros) - if (sections.Exists("load")) - { - auto config = ConfigArray(sections("load")); - for (int i=0; i < config.size(); ++i) - { - Parse(sections(config[i])); - } - loadOrRunFound = true; - } - - // load and then execute - if (sections.Exists("run")) - { - auto config = ConfigArray(sections("run")); - for (int i=0; i < config.size(); ++i) - { - Parse(sections(config[i])); - } - loadOrRunFound = true; - } - - // didn't find any of the tags, so just parse the whole thing as a script - if (!loadOrRunFound) - { - // surround text in braces so we parse correctly - std::string textInBraces = "[ "+stringParse+" ]"; - Parse(textInBraces); - } - } - - // IsMacroDefinition - is this a macro definition? 
- // returns - true if a definition, otherwise false - bool IsMacroDefinition() - { - return m_definingMacro; - } - - // CheckName - check for a name in our symbols, see if it exists - // name - name we are looking for - // localOnly - only look in the current scope, and not the global scope - // if it does exist return the node that represents the name - NDLNode* CheckName(const std::string& name, bool localOnly=false) - { - // first try local script - auto found = FindSymbol(name); - if (found != NULL) - { - return found; - } - - // next try the globals, this includes macros and global constants - if (!localOnly) - { - auto found = s_global.FindSymbol(name); - if (found != NULL) - { - NDLNode* node = found; - if (node->GetType() == ndlTypeMacro) - { - // if we are calling a macro we need to keep track of formal parameters, - // keep them as strings in this macroCall node - NDLNode* newNode = new NDLNode("", name, this, ndlTypeMacroCall); - NDLScript* script = node->GetScript(); - - // if this is a macro call (and not a definition), we want to expand the macro (make a copy) - if (!IsMacroDefinition()) - { - script = new NDLScript(*script); - } - newNode->SetScript(script); - - newNode->SetParamMacro(node->GetParamMacro()); - node = newNode; - } - return node; - } - } - - std::string functionName = name; - // check for function name, a function may have two valid names - // in which case 'functionName' will get the default node name returned - if (CheckFunction(functionName)) - { - NDLNode* ndlNode = new NDLNode("", functionName, this, ndlTypeFunction); - return ndlNode; - } - - // not found, return NULL - return NULL; - } - - // CallStringParse - parse the string description of a call sequence - // token - [in] string description of the call - // nameFunction - [out] name of the function being called - // params - [out] parameters to the function, set to empty string if no parameters - // returns: the node (if it exists) that matches this function name, otherwise NULL - NDLNode* CallStringParse(const std::string& token, std::string& nameFunction, std::string& params) - { - auto paramStart = token.find_first_of(OPENBRACES); - if (paramStart == npos) - RuntimeError("Invalid macro/function call can not be parsed: %s\n", token.c_str()); - nameFunction = token.substr(0, paramStart); - Trim(nameFunction); - params = token.substr(paramStart); - NDLNode* ndlNodeFound = CheckName(nameFunction); - return ndlNodeFound; - } - - - // ParseParameters - parse the parameters of a macro, or an array - // ndlNode - node we should add the parameters to - // value - parameters as config value - // createNew - create a new parameter node if one does not exist - void ParseParameters(NDLNode* ndlNode, const ConfigValue& value, bool createNew=false) - { - ConfigArray parameters = value; - for (auto iter = parameters.begin(); iter != parameters.end(); ++iter) - { - ConfigValue param = *iter; - NDLNode* paramNode = NULL; - auto foundBrace = param.find_first_of(FUNCTIONOPEN); - if (foundBrace != npos) // a nested call as a parameter - paramNode = ParseCall(param); - else // must be predefined variable or constant - { - paramNode = ParseVariable(param, createNew); - - // if we can't find the node right now, it's undetermined, must be defined later, or throw an error later - if (paramNode == nullptr) - { - paramNode = new NDLNode(param, param, this, ndlTypeUndetermined); - // add to the symbol table - AddSymbol(param, paramNode); - } - } - if (paramNode == NULL) - { - RuntimeError("variable name '%s' not found, must 
be previously defined\n", param.c_str()); - } - else - { - ndlNode->InsertParam(paramNode); - } - } - } - - // ParseVariable - parse a variable or constant - // token - string containing the variable or constant - // createNew - create a new variable node if no node found - // returns: the node that represents this newly defined variable - NDLNode* ParseVariable(const std::string& token, bool createNew=true) - { - NDLNode* ndlNode = NULL; - auto openBrace = token.find_first_of(OPENBRACES); - if (openBrace == 0) - { - ndlNode = new NDLNode("", token, this, ndlTypeArray); - ndlNode->SetParamString(token); - ParseParameters(ndlNode, token); - return ndlNode; - } - - auto found = token.find_first_not_of("+-.0123456789eE"); - // see if it's a numeric constant - if (found == npos) - { - ndlNode = new NDLNode("", token, this, ndlTypeConstant); - } - // not a constant, so must be a variable - else - { - // look for an optional parameter - auto foundEqual = token.find_first_of('='); - bool optional = (foundEqual != npos); - if (optional) - { - std::string name = token.substr(0, foundEqual); - Trim(name); - std::string value = token.substr(foundEqual+1); - Trim(value); - - ndlNode = new NDLNode(name, value, this, ndlTypeOptionalParameter); - } - else - { - ndlNode = CheckName(token); - if (createNew && ndlNode == NULL) - { - // NOTE: currently we only get here in Parameter scenarios, - // if other scenarios present themselves, need a good way to change the type - ndlNode = new NDLNode(token, token, this, ndlTypeParameter); - AddSymbol(token, ndlNode); - } - } - } - return ndlNode; - } - - // ParseDefinition - parse a macro definition - // token - string containing the macro definition (without the macro body) - // returns: the node that represents this newly defined macro - NDLNode* ParseDefinition(const std::string& token) - { - std::string nameFunction, params; - NDLNode* ndlNode = CallStringParse(token, nameFunction, params); - if (ndlNode) - RuntimeError("function '%s' already defined\n", nameFunction.c_str()); - ndlNode = new NDLNode(nameFunction, params, &s_global, ndlTypeMacro); - - // now set the variables/parameters which will be parsed when the body shows up - ndlNode->SetParamMacro(params); - - // now add this to the globals - s_global.AddSymbol(nameFunction,ndlNode); - - // NOTE: the body of the Macro will be parsed separately, this just sets up the node - return ndlNode; - } - - // ParseCall - parse the call syntax out into "function" and variables - // token - string containing the "call" - // return - Node pointer, the newly created node - NDLNode* ParseCall(const std::string& token) - { - std::string nameFunction, params; - NDLNode* ndlNode = CallStringParse(token, nameFunction, params); - - if (ndlNode == NULL) - RuntimeError("Undefined function or macro '%s' in %s\n", nameFunction.c_str(), token.c_str()); - - // now setup the variables/parameters - ConfigValue value = ConfigValue(params, nameFunction); - - ndlNode->SetParamString(value); - ParseParameters(ndlNode, value); - return ndlNode; - } - - // parse a 'key=value' pair and create the appropriate node for what was seen - // 'key=Function(x,y,z)' - function - // 'macro(x,y)={z=Input(x,y)} - // may also be Function(x,y,z), a nameless call (used in one-line macros) - std::string::size_type ParseValue(const std::string& stringParse, std::string::size_type tokenStart, std::string::size_type tokenEnd) - { - // first find previous character - - // skip leading spaces - tokenStart = stringParse.find_first_not_of(" \t", tokenStart); - 
auto keyEnd = stringParse.find_first_of(OPENBRACES"=", tokenStart); - bool equalFound = (keyEnd != npos && keyEnd < tokenEnd && stringParse[keyEnd] == '='); - - // this should be the body of the macro - if (m_macroNode) - { - bool oneLineDefinition = false; - NDLNode* macroNode = m_macroNode; - - // an '=' at the beginning, skip it - if (keyEnd == tokenStart && equalFound) - { - // skip the '=' sign - oneLineDefinition = true; - tokenStart = stringParse.find_first_not_of(" \t", tokenStart+1); - if (tokenStart == npos) - RuntimeError("Body of Macro missing"); - } - - NDLScript* script = new NDLScript(ConfigValue(stringParse.substr(tokenStart, tokenEnd-tokenStart), macroNode->GetName()), macroNode->GetName(), oneLineDefinition); - macroNode->SetScript(script); - - // reset so we know we are done with the body - m_macroNode = NULL; - - return tokenEnd; // done with the macro now - } - - // if we hit the end of the token before we hit an equal sign, it's a 'macro(x,y)' definition - // unless we are a one-line macro in which case we don't allow definitions - if (!m_noDefinitions && !equalFound) - { - keyEnd = stringParse.find_first_of(OPENBRACES, tokenStart); - if (keyEnd == npos || keyEnd >= tokenEnd) - RuntimeError("Invalid statement, does not contain an '=' sign: %s\n", stringParse.substr(tokenStart, tokenEnd-tokenStart).c_str()); - m_macroNode = ParseDefinition(stringParse.substr(tokenStart, tokenEnd-tokenStart)); - // the body of the macro will come through next time - return tokenEnd; - } - - // get the key value (symbol name) - std::string key; - - // no macro definitions allowed, so no equal means a function call - if (m_noDefinitions && !equalFound) - { - ;// nothing to do here, just skip the "key=" parsing below - } - else - { - key = stringParse.substr(tokenStart, keyEnd-tokenStart); - Trim(key); - - // check to make sure variable name isn't a valid function name as well - string strTemp = key; - if (CheckFunction(strTemp)) - RuntimeError("variable %s is invalid, it is reserved because it is also the name of a function", key.c_str()); - - tokenStart = keyEnd; - if (stringParse[keyEnd] == '=') - ++tokenStart; - - // skip any spaces before the second token - tokenStart = stringParse.find_first_not_of(" \t", tokenStart); - } - std::string::size_type substrSize = tokenEnd - tokenStart; - - auto bracesEnd = FindBraces(stringParse, tokenStart); - - // if braces found, we modify the token end according to braces - if (bracesEnd != npos) - { // include the trailing brace - tokenEnd = bracesEnd+1; - substrSize = tokenEnd - tokenStart; - - // for quote delimited string remove quotes - if (stringParse[tokenStart] == '"') - { - tokenStart++; - substrSize -= 2; // take out the quotes - } - } - - if (substrSize == 0) - return npos; - - // get the value - std::string value = stringParse.substr(tokenStart, substrSize); - Trim(value); - - NDLNode* ndlNode = NULL; - - // check for a function/macro call - auto found = value.find_first_of(FUNCTIONOPEN); - if (found != npos && found > 0) // brace found after some text, so a call - { - ndlNode = ParseCall(value); - // check if we have a user defined name, ParseCall assigns a default name - if (!key.empty()) - ndlNode->SetName(key); - AddSymbol(ndlNode->GetName(),ndlNode); - m_script.push_back(ndlNode); - } - // if it's not a call, must be a variable - else - { - ndlNode = ParseVariable(value); - bool newNode = ndlNode->GetName().empty(); - AddSymbol(key,ndlNode); - - ndlNode->SetName(key); - if (newNode) //only need to add nodes that are new (not renames) 
- { - m_script.push_back(ndlNode); - } - } - - return tokenEnd; - } - - // ExpandMacro - Expand a macro into a new macro definition - // node - NDLNode that holds the macro call - // returns: new node with the expanded macro - NDLNode* ExpandMacro(const NDLNode* node) - { - assert(node->GetType() == ndlTypeMacroCall); // needs to be a macro call (not definition) - - std::string name = node->GetName(); - // if we are calling a macro make a new copy of it and execute that instead (macro expansion) - // we do this so the evalValues in the macros will be valid regardless of number of instantiations - NDLNode* newNode = new NDLNode(name, node->GetValue(), this, ndlTypeMacroCall); - NDLScript* newScript = new NDLScript(*node->GetScript()); - newNode->SetScript(newScript); - newNode->SetParamMacro(node->GetParamMacro()); - - // now get the parameters to the macro added - ConfigValue paramString = node->GetParamString(); - ParseParameters(newNode, paramString, true /*createNew*/); - newNode->SetParamString(paramString); - - // fixup the symbol table to point to this one instead - AssignSymbol(name, newNode); - return newNode; - } - - // Evaluate - Evaluate the script - // nodeEval - the node evaluator to call - // baseName - baseName for all labels - // pass - what NDLPass are we on? - // skipThrough - skip through this node, will skip eval for all nodes up to and including this one - NDLNode* Evaluate(NDLNodeEvaluator& nodeEval, const wstring& baseName, const NDLPass pass=ndlPassInitial, NDLNode* skipThrough=nullptr) - { - NDLNode* nodeLast = skipThrough; - bool skip = skipThrough != nullptr; - std::wstring prevBaseName = GetBaseName(); - SetBaseName(baseName); - - for (auto& node : m_script) - { - // if we are in skip mode, and we found the skipThrough node, - // move out of skip mode and start processing at next node - if (skip) - { - if (node == skipThrough) - skip = false; - continue; - } - - // if it's a macro call, call the macro - if (node->GetType() == ndlTypeMacroCall) - { - node->EvaluateMacro(nodeEval, baseName, pass); - nodeEval.ProcessOptionalParameters(node); - } - else - { - nodeEval.Evaluate(node, baseName, pass); - } - nodeLast = node; - } - SetBaseName(prevBaseName); - return nodeLast; - } -}; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +#pragma once +#include "commandArgUtil.h" +#include "ComputationNode.h" +#include "TrainingCriterionNode.h" +#include "CompositeComputationNode.h" +#include "EvaluationCriterionNode.h" +#include "ComputationNetwork.h" +#include + +namespace Microsoft { namespace MSR { namespace CNTK { + +// EqualInsensitive - check to see if two nodes are equal up to the length of the first string (must be at least half as long as actual node name) +// string1 - [in,out] string to compare, if comparision is equal insensitive but not sensitive, will replace with sensitive version +// string2 - second string to compare +// alternate - alternate naming of the string +// return - true if strings are equal insensitive and modifies string1 to sensitive version if different +bool EqualInsensitive(std::wstring& string1, const std::wstring& string2, const wchar_t* alternate=NULL); + +// CheckFunction - check to see if we match a function name +// string1 - [in,out] string to compare, if comparision is equal and at least half the full node name will replace with full node name +// allowUndeterminedVariable - [out] set to true if undetermined variables (symbols yet to be defined) are allowed here +// return - true if function name found +template +bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable=nullptr); + +// NDLType - Network Description Language node type +enum NDLType +{ + ndlTypeNull, + ndlTypeConstant, + ndlTypeFunction, + ndlTypeVariable, + ndlTypeParameter, // parameter value, must be looked up to get actual value + ndlTypeUndetermined, // an undetermined value that will later be resolved + ndlTypeOptionalParameter, + ndlTypeArray, + ndlTypeMacroCall, // calling a macro + ndlTypeMacro, // definition of a macro + ndlTypeMax +}; + +// NDLPass - enumeration for the number of passes through the NDL parser +enum NDLPass +{ + ndlPassInitial, // inital pass, create nodes + ndlPassResolve, // resolve any undetermined symbols (variables that were not yet declared in NDL) + ndlPassFinal, // final pass done post-validation (when all matrices are allocated to the correct size) + ndlPassAll = ndlPassFinal, // all passes, used as flag in NDLUtil.h + ndlPassMax // number of NDLPasses +}; + +// ++ operator for this enum, so loops work +NDLPass &operator++(NDLPass &ndlPass); + +// Predeclaration of Script and Node +template +class NDLScript; + +template +class NDLNode; + +// NDLNodeEvaluator - Node evaluaton interface +// implemented by execution engines to convert script to approriate internal formats +template +class NDLNodeEvaluator +{ +public: + virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) = 0; + virtual ~NDLNodeEvaluator() = 0; + + // EvaluateParameter - Evaluate a parameter of a call + // node - NDLNode of the script + // nodeParam - NDLNode parameter we are evaluating + // baseName - name of the base node + // pass - which pass through the NDL nodes + // returns: the node that is the evaluated parameter + virtual NDLNode* EvaluateParameter(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseName, const NDLPass pass ) = 0; + + // EvaluateParameters - Evaluate the parameters of a call + // node - NDLNode we are evaluating paramters for + // baseName - baseName for the current node + // nodeParamStart - starting parameter that contains a node + // nodeParamCount - ending parameter that contains a node + // pass - NDL pass we are evaluating + // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator + virtual 
std::vector EvaluateParameters(NDLNode* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass) = 0; + + // FindSymbol - Search the engines symbol table for a fully quantified symbol + // symbol - name of the symbol + // returns - pointer to the matching EvalValue for that node, of NULL if not found + virtual void* FindSymbol(const wstring& /*symbol*/) + { + return NULL; + } + // ProcessOptionalParameters - Process the optional parameters of a node + // node to process + virtual void ProcessOptionalParameters(NDLNode* /*node*/) + { + return; + } + +}; + +template class NDLNodeEvaluator; +template class NDLNodeEvaluator; + +template +class NetNdl // class to associate a network with an NDLScript +{ +public: + ComputationNetwork* cn; + NDLScript* ndl; // NDLScript we are using for this network. NOTE: the actual script used + NDLNode* lastNode[ndlPassMax]; // last node we evaluated for each pass + NetNdl(): cn(nullptr), ndl(nullptr) {ClearLastNodes();} + NetNdl(ComputationNetwork*p_cn): cn(p_cn), ndl(nullptr) {ClearLastNodes();} + NetNdl(ComputationNetwork*p_cn, NDLScript* p_ndl): cn(p_cn), ndl(p_ndl) {ClearLastNodes();} + ~NetNdl() + {} + + // ClearLastNodes - Clear out the last node values for all passes + void ClearLastNodes() + { + for (NDLPass pass=ndlPassInitial;pass < ndlPassMax;++pass) + { + lastNode[pass] = nullptr; + } + } + + // Clear - clear out everything in the structure + // NOTE: this deletes the network and the NDLScript, use with care! + void Clear() + { + delete cn; + delete ndl; + cn = nullptr; + ndl = nullptr; + ClearLastNodes(); + } +}; + +template +inline NDLNodeEvaluator::~NDLNodeEvaluator() { } // defined even though it's virtual; supposed to be faster this way + +// NDLNode - Network Description Language Node +// Used to represent a named entity in the NDL +// if a name is not provided (such as in nesting scenarios) one will be generated +template +class NDLNode +{ +private: + std::string m_name; // value on the left of the equals + ConfigValue m_value; // value on the right of the equals (CN node name, or value) + NDLScript* m_parent; // parent script + NDLType m_type; //type of node + ConfigArray m_paramString; // parameter of a function/array + ConfigArray m_paramMacro; // parameter of a macro (the variables used in the macro definition) + vector m_parameters; // parameters as nodes/array elements + void *m_eval; // pointer to an arbitrary eval structure + NDLScript* m_script; // script for ndlTypeMacro + static int s_nameCounter; // counter for generating unique names +public: + NDLNode(const std::string& name, ConfigValue value, NDLScript* parent, NDLType ndlType) + { + if (name.empty()) + GenerateName(); + else + m_name = name; + m_value = value; + m_parent = parent; + assert(parent != NULL); + parent->AddChild(this); + m_type = ndlType; + m_eval = NULL; + m_script = NULL; + } + + ~NDLNode() + {} + + // publicly accessible Copy method + // should only be used for macro expansion + NDLNode* Copy() const + { + NDLNode* ret = new NDLNode(*this); + return ret; + } + +private: + + // copy constructor, creates a new disconnected copy of this node for macro expansion + NDLNode(const NDLNode& copyMe); + + NDLNode& operator=(NDLNode& /*copyMe*/) //this is just a place holder implementation which is not functioning but prevent callers to use it. 
+    {
+        throw std::logic_error("'NDLNode& operator=(NDLNode& copyMe)' should never be called.");
+    }
+
+    // generate a generic symbol name for a node
+    void GenerateName()
+    {
+        char buffer[16]; // large enough for any int value of s_nameCounter
+        sprintf(buffer, "%d", ++s_nameCounter);
+        m_name = std::string("unnamed") + buffer;
+    }
+
+public:
+    void SetScript(NDLScript<ElemType>* script) {m_script = script;}
+    NDLScript<ElemType>* GetScript() const {return m_script;}
+    void SetType(NDLType type) {m_type = type;}
+    NDLType GetType() const {return m_type;}
+    const std::string& GetName() const {return m_name;}
+    void SetName(std::string& name) {m_name = name;}
+    ConfigValue GetValue() const {return m_value;}
+    void SetValue(std::string& value) {m_value = value;}
+
+    // parameters of a function (ndlTypeFunction), or parameters in the call to a macro
+    void SetParamString(ConfigValue paramString) {m_paramString = paramString;}
+    ConfigArray GetParamString() const {return m_paramString;}
+
+    // parameters of a macro
+    void SetParamMacro(ConfigValue paramMacro) {m_paramMacro = paramMacro;}
+    ConfigArray GetParamMacro() const {return m_paramMacro;}
+
+    void SetParentScript(NDLScript<ElemType>* script) {m_parent = script;}
+    NDLScript<ElemType>* GetParentScript() { return m_parent; }
+
+    // get parameters, either just optional or just regular
+    vector<NDLNode*> GetParameters(bool optional=false) const
+    {
+        vector<NDLNode*> result;
+        for (NDLNode* param : m_parameters)
+        {
+            bool optParam = param->GetType() == ndlTypeOptionalParameter;
+            if (optParam == optional)
+                result.push_back(param);
+        }
+        return result;
+    }
+
+    // Get/Set eval values
+    void* GetEvalValue() const { return m_eval;}
+    void SetEvalValue(void* evalValue) {m_eval = evalValue;}
+
+    // GetOptionalParameter - Get an optional parameter value
+    // name - the name to search for in the optional parameters
+    // deflt - the default value (if not found)
+    // returns: parameter value if found, or default value otherwise
+    ConfigValue GetOptionalParameter(const std::string& name, const std::string& deflt) const
+    {
+        for (NDLNode* param : m_parameters)
+        {
+            bool optParam = param->GetType() == ndlTypeOptionalParameter;
+            if (optParam && !_stricmp(param->GetName().c_str(), name.c_str()))
+            {
+                // the value may itself name a variable; if it resolves, return the scalar it refers to
+                auto paramValue = param->GetValue();
+                auto resolveParamNode = m_parent->ParseVariable(paramValue, false);
+                if (resolveParamNode != nullptr)
+                    return resolveParamNode->GetScalar();
+                else
+                    return paramValue;
+            }
+        }
+        return ConfigValue(deflt);
+    }
+
+    // FindNode - Find a node of the given name
+    // name - name to search for
+    // searchForDotNames - search for NDL symbols traversing the call hierarchy
+    // returns: the node with that name, or NULL if not found
+    NDLNode* FindNode(const std::string& name, bool searchForDotNames=false)
+    {
+        NDLNode* found = m_parent->FindSymbol(name, searchForDotNames);
+        if (!found)
+            found = NDLScript<ElemType>::GlobalScript().FindSymbol(name, searchForDotNames);
+        return found;
+    }
+
+    // GetScalar - Get a scalar value from a node, possibly following a chain of variables to reach a constant
+    // returns: scalar value
+    ConfigValue GetScalar()
+    {
+        NDLNode* node = this;
+        while (node && (node->GetType() == ndlTypeVariable || node->GetType() == ndlTypeParameter))
+        {
+            NDLNode* nodeLast = node;
+            node = node->FindNode(node->GetValue(), true /*searchForDotNames*/);
+
+            // if we are still on the same node, it was never resolved to anything: an undefined variable
+            if (nodeLast == node)
+            {
+                RuntimeError("undefined variable '%s'; must be declared before first use\n", node->GetName().c_str());
+            }
+        }
+        if (!node || node->GetType() !=
ndlTypeConstant) + { + std::string name = node ? node->GetName() : GetName(); + RuntimeError("Scalar expected, '%s' must be a constant or variable that resolves to a constant\n", name.c_str()); + } + return node->GetValue(); + } + + void InsertParam(NDLNode* param) {m_parameters.push_back(param);} + + // EvaluateMacro - Evaluate a macro, make the call + // nodeEval - the node evaluator we are using to interpret the script + // baseName - base name for all symbols at this level + // pass - what NDLPass are we in? + // returns: the return node for this macro + NDLNode* EvaluateMacro(NDLNodeEvaluator& nodeEval, const wstring& baseName, const NDLPass pass) + { + if (m_type != ndlTypeMacroCall) + return NULL; + + // make sure the actual parameters and expected parameters match + if (m_parameters.size() < m_paramMacro.size()) + { + RuntimeError("Parameter mismatch, %d parameters provided, %d expected in call to %s\n", + m_parameters.size(),m_paramMacro.size(),m_value.c_str()); + } + + // assign the actual parameters in the script so we can execute it + for (int i=0; i < m_parameters.size(); ++i) + { + NDLNode* nodeParam = m_parameters[i]; + std::string paramName = i < m_paramMacro.size()?m_paramMacro[i]:nodeParam->GetName(); + + // if the node is a parameter then look it up in the symbol table + if (nodeParam->GetType() == ndlTypeParameter) + { + nodeParam = m_parent->FindSymbol(nodeParam->GetName()); + } + // do we want to add optional parameters as symbols, or not? + else if (nodeParam->GetType() == ndlTypeOptionalParameter) + { + if (i < m_paramMacro.size()) + RuntimeError("Parameter mismatch, parameter %d is an optional parameter, but should be a required parameter\n",i); + // if no symbol yet, add it + if (!m_script->ExistsSymbol(paramName)) + { + m_script->AddSymbol(paramName, nodeParam); + continue; + } + //else assign the value below + } + + // assign the parameter symbols in the script we will call with the values passed to the call + m_script->AssignSymbol(paramName, nodeParam); + + } + + std::wstring newBase = baseName; + if (!newBase.empty()) + newBase += L"."; + newBase += msra::strfun::utf16(m_name); + + // now evaluate the contained macro script + NDLNode* nodeResult = m_script->Evaluate(nodeEval, newBase, pass); + // Consider: do we need to restore the original mapping here, may need to for recursive calls? + + // look for a symbol that is identical to the macro name, if it exists this is the return value + NDLNode* nodeMacroName = m_script->FindSymbol(m_value); + if (nodeMacroName) + { + nodeResult = nodeMacroName; + } + + // set the eval node to be the same as the return value; + if (nodeResult) + { + m_eval = nodeResult->GetEvalValue(); + } + return nodeResult; + } +}; + +template +class NDLScript: public ConfigParser +{ +private: + std::wstring m_baseName; + std::string m_scriptString; + std::vector*> m_script; // script lines in parsed node order, macros will have definition followed by body + std::map*, nocase_compare> m_symbols; // symbol table + NDLNode* m_macroNode; // set when interpretting a macro definition + bool m_noDefinitions; // no definitions can be made in this script, interpret all macro/function names as calls + static NDLScript s_global; //("global"); // global script for storing macros and global nodes + std::vector*> m_children; // child nodes. Note that m_script nodes may not be children of this object, they include macro nodes + ComputationNetwork* m_cn; // computation network to use for backup symbol lookup. 
Used for MEL where NDL and network nodes are mixed + bool m_definingMacro; // currently defining a macro, flag to determine if we are defining or interpretting a macro call + +public: + // constructors that take a config name + NDLScript(const std::string & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } + NDLScript(const std::wstring & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } + ~NDLScript() + { + // need to free all the child nodes attached to this script node + for (NDLNode* node : m_children) + { + delete node; + } + m_children.clear(); + } + + // empty constructor + NDLScript() : ConfigParser(';') { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } // parameterless version if needed + + // construct NDLScript from a ConfigValue, propogate the config Name + NDLScript(const ConfigValue& configValue) : ConfigParser(';',configValue.Name()) + { + m_macroNode = NULL; + m_noDefinitions=false; + m_definingMacro = false; + m_scriptString = configValue; + Parse(m_scriptString); + } + + // construct NDLScript from a ConfigValue, propogate the config Name + // configValue - the body of the macro + // oneLineDefinition - this macro definition is all on one line, names optional + // macroName - if the macro has a name, the name - this is used to get parameter info + NDLScript(const ConfigValue& configValue, std::string macroName, bool oneLineDefinition) : ConfigParser(';',configValue.Name()) + { + m_noDefinitions = oneLineDefinition; + m_definingMacro = true; + m_macroNode = NULL; + m_scriptString = configValue; + NDLNode* ndlNode = s_global.CheckName(macroName, true); + if (ndlNode == NULL) + RuntimeError("Invalid macro definition, %s not found", macroName.c_str()); + + // get and parse the parameters + ConfigArray parameters = ndlNode->GetParamMacro(); + for (auto iter = parameters.begin(); iter != parameters.end(); ++iter) + { + // we are adding parameters that will be replaced by actual values later + ConfigValue param = *iter; + + // check to make sure this parameter name is not a reserved word + std::string functionName = param; + // check for function name, a function may have two valid names + // in which case 'functionName' will get the default node name returned + if (CheckFunction(functionName)) + { + RuntimeError("NDLScript: Macro %s includes a parameter %s, which is also the name of a function. Parameter names may not be the same as function names.", macroName.c_str(), param.c_str()); + } + + NDLNode* paramNode = new NDLNode(param, param, this, ndlTypeParameter); + // add to node parameters + ndlNode->InsertParam(paramNode); + // add to script symbol table + AddSymbol(param, paramNode); + } + Parse(m_scriptString); + m_definingMacro = false; + } + + + // copy and move constructors + NDLScript(const NDLScript& copyMe); + NDLScript(const NDLScript&& moveMe); +private: + NDLNode* DuplicateNode(NDLNode* node); +public: + // GlobalScript - Access to global script + static NDLScript& GlobalScript() {return s_global;} + + // SetMacroDefinitionsAllowed - allow macro definitions + // macroAllowed - can macros be defined in this script? 
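+ // (note: the flag is stored inverted in m_noDefinitions; when that member is true, every macro/function name is treated as a call)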
+ void SetMacroDefinitionsAllowed(bool macroAllowed) + { + m_noDefinitions = !macroAllowed; + } + + void SetBaseName(const std::wstring& baseName) + { + m_baseName = baseName; + } + const std::wstring& GetBaseName() + { + return m_baseName; + } + + void ClearGlobal() + { + s_global.Clear(); + } + + void Clear() + { + + for (NDLNode* node : m_children) + { + delete node; + } + m_children.clear(); + for (NDLNode* node : m_script) + { + delete node; + } + m_script.clear(); + + m_symbols.clear(); + } + void ClearEvalValues() + { + for (NDLNode* node : m_children) + { + node->SetEvalValue(NULL); + } + } + // AddChild - add a child node to the script + // node - node to add + // NOTE: this NDLScript owns this node and is responsible to delete it + void AddChild(NDLNode* node) + { + m_children.push_back(node); + } + + // SetComputationNetwork - set the computation network this NDL is associated with + void SetComputationNetwork(ComputationNetwork* cn) + { + m_cn = cn; + } + + // FindSymbol - Find a symbol to the symbol table + // symbol - symbol to find + // searchForDotNames - search for NDL symbols traversing call heirarchy + // returns - node this symbol references + NDLNode* FindSymbol(const std::string& symbol, bool searchForDotNames=true) + { + auto found = m_symbols.find(symbol); //search symbol directly first + if (found != m_symbols.end()) + return found->second; + + // if not found, handle dot names by move up the hierarchy + size_t firstDot = symbol.find_first_of('.'); + if (firstDot == npos) + return nullptr; + + std::string search = symbol.substr(0,firstDot); + found = m_symbols.find(search); + if (found == m_symbols.end()) + { + return NULL; + } + + // handle dot names, + if (firstDot != npos) + { + NDLNode* node = found->second; + NDLScript* script = node->GetScript(); + // if there is no script, probably a parameter/variable with further 'dot' values (ie. var.CE.BFF) + if (script != NULL) + { + if (node->GetType() != ndlTypeMacroCall || script == NULL) + RuntimeError("Symbol name not valid, %s is not a macro, so %s cannot be interpretted",search.c_str(),symbol.c_str() ); + return script->FindSymbol(symbol.substr(firstDot+1), searchForDotNames); + } + } + return found->second; + } + + // ExistsSymbol - Find if a symbol exists (value might be NULL) + // symbol - symbol to find + // returns - true if it's there + bool ExistsSymbol(const std::string& symbol) + { + auto found = m_symbols.find(symbol); + return (found != m_symbols.end()); + } + + // ContainsOptionalParameter - do any nodes in this script have an optional parameter by the following name? 
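+ // (optional parameters are the name=value entries attached to a node, for example tag=feature)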
+ // optParamName - name of parameter we are searching for + // returns: vector of the nodes found (empty if nothing found) + vector*> ContainsOptionalParameter(const std::string& optParamName) + { + vector*> result; + std::string empty; + for (auto symbol : m_symbols) + { + NDLNode* node = symbol.second; + std::string value = node->GetOptionalParameter(optParamName, empty); + if (!value.empty()) + { + result.push_back(node); + } + } + return result; + } + + // AddSymbol - Add a symbol to the symbol table + // symbol - symbol to add + // node - node this symbol references + // NOTE: at present we don't allow reuse of a symbol, so this throws an error if it sees an existing symbol + void AddSymbol(const std::string& symbol, NDLNode* node) + { + auto found = m_symbols.find(symbol); + if (found != m_symbols.end()) + { + NDLNode* nodeFound = found->second; + // check for undetermined nodes, because these nodes are to be defined later + if (nodeFound->GetType() != ndlTypeUndetermined && nodeFound->GetType() != ndlTypeParameter) + { + std::string value = found->second->GetValue(); + RuntimeError("Symbol '%s' currently assigned to '%s' reassigning to a different value not allowed\n", symbol.c_str(), value.c_str()); + } + } + m_symbols[symbol] = node; + } + + // AssignSymbol - Assign a new value to a symbol in the table + // symbol - symbol to assign + // node - node this symbol will reference + void AssignSymbol(const std::string& symbol, NDLNode* node) + { + auto found = m_symbols.find(symbol); + if (found == m_symbols.end()) + { + RuntimeError("Symbol '%s' currently does not exist, attempting to assigned value '%s' AssignSymbol() requires existing symbol\n", symbol.c_str(), node->GetValue().c_str()); + } + m_symbols[symbol] = node; + } + + + // FileParse - parse at the file level, can be overridden for "section of file" behavior + // stringParse - file concatentated as a single string + void FileParse(const std::string& stringParse) + { + ConfigParameters sections(stringParse); + bool loadOrRunFound = false; + + // load all the sections that we want (macros) + if (sections.Exists("load")) + { + auto config = ConfigArray(sections("load")); + for (int i=0; i < config.size(); ++i) + { + Parse(sections(config[i])); + } + loadOrRunFound = true; + } + + // load and then execute + if (sections.Exists("run")) + { + auto config = ConfigArray(sections("run")); + for (int i=0; i < config.size(); ++i) + { + Parse(sections(config[i])); + } + loadOrRunFound = true; + } + + // didn't find any of the tags, so just parse the whole thing as a script + if (!loadOrRunFound) + { + // surround text in braces so we parse correctly + std::string textInBraces = "[ "+stringParse+" ]"; + Parse(textInBraces); + } + } + + // IsMacroDefinition - is this a macro definition? 
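+ // (m_definingMacro is set while the macro-body constructor above is parsing and cleared when it finishes)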
+    // IsMacroDefinition - is this a macro definition?
+    // returns - true if a definition, otherwise false
+    bool IsMacroDefinition()
+    {
+        return m_definingMacro;
+    }
+
+    // CheckName - check for a name in our symbols, see if it exists
+    // name - name we are looking for
+    // localOnly - only look in the current scope, not the global scope
+    // if it does exist, return the node that represents the name
+    NDLNode<ElemType>* CheckName(const std::string& name, bool localOnly=false)
+    {
+        // first try the local script
+        auto found = FindSymbol(name);
+        if (found != NULL)
+        {
+            return found;
+        }
+
+        // next try the globals; this includes macros and global constants
+        if (!localOnly)
+        {
+            auto found = s_global.FindSymbol(name);
+            if (found != NULL)
+            {
+                NDLNode<ElemType>* node = found;
+                if (node->GetType() == ndlTypeMacro)
+                {
+                    // if we are calling a macro we need to keep track of formal parameters,
+                    // keep them as strings in this macroCall node
+                    NDLNode<ElemType>* newNode = new NDLNode<ElemType>("", name, this, ndlTypeMacroCall);
+                    NDLScript<ElemType>* script = node->GetScript();
+
+                    // if this is a macro call (and not a definition), we want to expand the macro (make a copy)
+                    if (!IsMacroDefinition())
+                    {
+                        script = new NDLScript<ElemType>(*script);
+                    }
+                    newNode->SetScript(script);
+
+                    newNode->SetParamMacro(node->GetParamMacro());
+                    node = newNode;
+                }
+                return node;
+            }
+        }
+
+        std::string functionName = name;
+        // check for a function name; a function may have two valid names,
+        // in which case 'functionName' will get the default node name returned
+        if (CheckFunction(functionName))
+        {
+            NDLNode<ElemType>* ndlNode = new NDLNode<ElemType>("", functionName, this, ndlTypeFunction);
+            return ndlNode;
+        }
+
+        // not found, return NULL
+        return NULL;
+    }
+
+    // CallStringParse - parse the string description of a call sequence
+    // token - [in] string description of the call
+    // nameFunction - [out] name of the function being called
+    // params - [out] parameters to the function, set to the empty string if there are no parameters
+    // returns: the node (if it exists) that matches this function name, otherwise NULL
+    NDLNode<ElemType>* CallStringParse(const std::string& token, std::string& nameFunction, std::string& params)
+    {
+        auto paramStart = token.find_first_of(OPENBRACES);
+        if (paramStart == npos)
+            RuntimeError("Invalid macro/function call, cannot be parsed: %s\n", token.c_str());
+        nameFunction = token.substr(0, paramStart);
+        Trim(nameFunction);
+        params = token.substr(paramStart);
+        NDLNode<ElemType>* ndlNodeFound = CheckName(nameFunction);
+        return ndlNodeFound;
+    }
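+// Illustrative sketch (editorial): CallStringParse splits a call string at
+// the first open brace; the text before it becomes the function/macro name
+// and the rest the raw parameter string. "Times" is a built-in function name
+// here; the surrounding script object is assumed.
+#if 0
+std::string name, params;
+NDLNode<float>* node = script.CallStringParse("Times(W, x)", name, params);
+// name == "Times", params == "(W, x)"; node comes from CheckName(name) and is
+// NULL when the name is neither a symbol, a macro, nor a function.
+#endif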
+    // ParseParameters - parse the parameters of a macro, or an array
+    // ndlNode - node we should add the parameters to
+    // value - parameters as a config value
+    // createNew - create a new parameter node if one does not exist
+    void ParseParameters(NDLNode<ElemType>* ndlNode, const ConfigValue& value, bool createNew=false)
+    {
+        ConfigArray parameters = value;
+        for (auto iter = parameters.begin(); iter != parameters.end(); ++iter)
+        {
+            ConfigValue param = *iter;
+            NDLNode<ElemType>* paramNode = NULL;
+            auto foundBrace = param.find_first_of(FUNCTIONOPEN);
+            if (foundBrace != npos) // a nested call as a parameter
+                paramNode = ParseCall(param);
+            else // must be a predefined variable or constant
+            {
+                paramNode = ParseVariable(param, createNew);
+
+                // if we can't find the node right now, it's undetermined: it must be defined later, or an error will be thrown later
+                if (paramNode == nullptr)
+                {
+                    paramNode = new NDLNode<ElemType>(param, param, this, ndlTypeUndetermined);
+                    // add to the symbol table
+                    AddSymbol(param, paramNode);
+                }
+            }
+            if (paramNode == NULL)
+            {
+                RuntimeError("variable name '%s' not found, must be previously defined\n", param.c_str());
+            }
+            else
+            {
+                ndlNode->InsertParam(paramNode);
+            }
+        }
+    }
+
+    // ParseVariable - parse a variable or constant
+    // token - string containing the variable or constant
+    // createNew - create a new variable node if no node is found
+    // returns: the node that represents this newly defined variable
+    NDLNode<ElemType>* ParseVariable(const std::string& token, bool createNew=true)
+    {
+        NDLNode<ElemType>* ndlNode = NULL;
+        auto openBrace = token.find_first_of(OPENBRACES);
+        if (openBrace == 0)
+        {
+            ndlNode = new NDLNode<ElemType>("", token, this, ndlTypeArray);
+            ndlNode->SetParamString(token);
+            ParseParameters(ndlNode, token);
+            return ndlNode;
+        }
+
+        auto found = token.find_first_not_of("+-.0123456789eE");
+        // see if it's a numeric constant
+        if (found == npos)
+        {
+            ndlNode = new NDLNode<ElemType>("", token, this, ndlTypeConstant);
+        }
+        // not a constant, so it must be a variable
+        else
+        {
+            // look for an optional parameter
+            auto foundEqual = token.find_first_of('=');
+            bool optional = (foundEqual != npos);
+            if (optional)
+            {
+                std::string name = token.substr(0, foundEqual);
+                Trim(name);
+                std::string value = token.substr(foundEqual+1);
+                Trim(value);
+
+                ndlNode = new NDLNode<ElemType>(name, value, this, ndlTypeOptionalParameter);
+            }
+            else
+            {
+                ndlNode = CheckName(token);
+                if (createNew && ndlNode == NULL)
+                {
+                    // NOTE: currently we only get here in Parameter scenarios;
+                    // if other scenarios present themselves, we need a good way to change the type
+                    ndlNode = new NDLNode<ElemType>(token, token, this, ndlTypeParameter);
+                    AddSymbol(token, ndlNode);
+                }
+            }
+        }
+        return ndlNode;
+    }
+
+    // ParseDefinition - parse a macro definition
+    // token - string containing the macro definition (without the macro body)
+    // returns: the node that represents this newly defined macro
+    NDLNode<ElemType>* ParseDefinition(const std::string& token)
+    {
+        std::string nameFunction, params;
+        NDLNode<ElemType>* ndlNode = CallStringParse(token, nameFunction, params);
+        if (ndlNode)
+            RuntimeError("function '%s' already defined\n", nameFunction.c_str());
+        ndlNode = new NDLNode<ElemType>(nameFunction, params, &s_global, ndlTypeMacro);
+
+        // now set the variables/parameters, which will be parsed when the body shows up
+        ndlNode->SetParamMacro(params);
+
+        // now add this to the globals
+        s_global.AddSymbol(nameFunction, ndlNode);
+
+        // NOTE: the body of the macro will be parsed separately; this just sets up the node
+        return ndlNode;
+    }
+
+    // ParseCall - parse the call syntax out into "function" and variables
+    // token - string containing the "call"
+    // return - node pointer, the newly created node
+    NDLNode<ElemType>* ParseCall(const std::string& token)
+    {
+        std::string nameFunction, params;
+        NDLNode<ElemType>* ndlNode = CallStringParse(token, nameFunction, params);
+
+        if (ndlNode == NULL)
+            RuntimeError("Undefined function or macro '%s' in %s\n", nameFunction.c_str(), token.c_str());
+
+        // now set up the variables/parameters
+        ConfigValue value = ConfigValue(params, nameFunction);
+
+        ndlNode->SetParamString(value);
+        ParseParameters(ndlNode, value);
+        return ndlNode;
+    }
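+// Illustrative sketch (editorial): a parameter token containing '=' becomes
+// an ndlTypeOptionalParameter node rather than a positional parameter, which
+// is what ContainsOptionalParameter() searches for later. The token
+// "init=uniform" is hypothetical.
+#if 0
+NDLNode<float>* opt = script.ParseVariable("init=uniform", /*createNew=*/false);
+// opt->GetType() == ndlTypeOptionalParameter, name "init", value "uniform"
+#endif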
+    // ParseValue - parse a 'key=value' pair and create the appropriate node for what was seen
+    // 'key=Function(x,y,z)' - function call
+    // 'macro(x,y)={z=Input(x,y)}' - macro definition
+    // may also be Function(x,y,z), a nameless call (used in one-line macros)
+    std::string::size_type ParseValue(const std::string& stringParse, std::string::size_type tokenStart, std::string::size_type tokenEnd)
+    {
+        // skip leading spaces
+        tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
+
+        auto keyEnd = stringParse.find_first_of(OPENBRACES "=", tokenStart);
+        bool equalFound = (keyEnd != npos && keyEnd < tokenEnd && stringParse[keyEnd] == '=');
+
+        // this should be the body of the macro
+        if (m_macroNode)
+        {
+            bool oneLineDefinition = false;
+            NDLNode<ElemType>* macroNode = m_macroNode;
+
+            // an '=' at the beginning, skip it
+            if (keyEnd == tokenStart && equalFound)
+            {
+                // skip the '=' sign
+                oneLineDefinition = true;
+                tokenStart = stringParse.find_first_not_of(" \t", tokenStart+1);
+                if (tokenStart == npos)
+                    RuntimeError("Body of macro missing");
+            }
+
+            NDLScript<ElemType>* script = new NDLScript<ElemType>(ConfigValue(stringParse.substr(tokenStart, tokenEnd-tokenStart), macroNode->GetName()), macroNode->GetName(), oneLineDefinition);
+            macroNode->SetScript(script);
+
+            // reset so we know we are done with the body
+            m_macroNode = NULL;
+
+            return tokenEnd; // done with the macro now
+        }
+
+        // if we hit the end of the token before we hit an equal sign, it's a 'macro(x,y)' definition,
+        // unless we are a one-line macro, in which case we don't allow definitions
+        if (!m_noDefinitions && !equalFound)
+        {
+            keyEnd = stringParse.find_first_of(OPENBRACES, tokenStart);
+            if (keyEnd == npos || keyEnd >= tokenEnd)
+                RuntimeError("Invalid statement, does not contain an '=' sign: %s\n", stringParse.substr(tokenStart, tokenEnd-tokenStart).c_str());
+            m_macroNode = ParseDefinition(stringParse.substr(tokenStart, tokenEnd-tokenStart));
+            // the body of the macro will come through next time
+            return tokenEnd;
+        }
+
+        // get the key value (symbol name)
+        std::string key;
+
+        // no macro definitions allowed, so no equal sign means a function call
+        if (m_noDefinitions && !equalFound)
+        {
+            ; // nothing to do here, just skip the "key=" parsing below
+        }
+        else
+        {
+            key = stringParse.substr(tokenStart, keyEnd-tokenStart);
+            Trim(key);
+
+            // check to make sure the variable name isn't also a valid function name
+            string strTemp = key;
+            if (CheckFunction(strTemp))
+                RuntimeError("variable %s is invalid, it is reserved because it is also the name of a function", key.c_str());
+
+            tokenStart = keyEnd;
+            if (stringParse[keyEnd] == '=')
+                ++tokenStart;
+
+            // skip any spaces before the second token
+            tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
+        }
+        std::string::size_type substrSize = tokenEnd - tokenStart;
+
+        auto bracesEnd = FindBraces(stringParse, tokenStart);
+
+        // if braces were found, we modify the token end according to the braces
+        if (bracesEnd != npos)
+        { // include the trailing brace
+            tokenEnd = bracesEnd+1;
+            substrSize = tokenEnd - tokenStart;
+
+            // for a quote-delimited string, remove the quotes
+            if (stringParse[tokenStart] == '"')
+            {
+                tokenStart++;
+                substrSize -= 2; // take out the quotes
+            }
+        }
+
+        if (substrSize == 0)
+            return npos;
+
+        // get the value
+        std::string value = stringParse.substr(tokenStart, substrSize);
+        Trim(value);
+
+        NDLNode<ElemType>* ndlNode = NULL;
+
+        // check for a function/macro call
+        auto found = value.find_first_of(FUNCTIONOPEN);
+        if (found != npos && found > 0) // brace found after some text, so a call
+        {
+            ndlNode = ParseCall(value);
+            // check if we have a user-defined name; ParseCall assigns a default name
+            if (!key.empty())
+                ndlNode->SetName(key);
+            AddSymbol(ndlNode->GetName(), ndlNode);
+            m_script.push_back(ndlNode);
+        }
+        // if it's not a call, it must be a variable
+        else
+        {
+            ndlNode = ParseVariable(value);
+            bool newNode = ndlNode->GetName().empty();
+            AddSymbol(key, ndlNode);
+
+            ndlNode->SetName(key);
+            if (newNode) // only need to add nodes that are new (not renames)
+            {
+                m_script.push_back(ndlNode);
+            }
+        }
+
+        return tokenEnd;
+    }
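+// Illustrative sketch (editorial): how ParseValue dispatches on a few
+// hypothetical statements. A definition header arms m_macroNode, so the very
+// next token is consumed as that macro's body.
+//
+//     FF(x,W)             // no '=' before the brace: ParseDefinition, arms m_macroNode
+//     {z=Times(W,x)}      // consumed as the body of FF via a nested NDLScript
+//     L1=FF(features,W0)  // key "L1", value is a call: ParseCall + AddSymbol
+//     HDim=1024           // key "HDim", value is a constant: ParseVariable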
+    // ExpandMacro - expand a macro into a new macro definition
+    // node - NDLNode that holds the macro call
+    // returns: new node with the expanded macro
+    NDLNode<ElemType>* ExpandMacro(const NDLNode<ElemType>* node)
+    {
+        assert(node->GetType() == ndlTypeMacroCall); // needs to be a macro call (not a definition)
+
+        std::string name = node->GetName();
+        // if we are calling a macro, make a new copy of it and execute that instead (macro expansion)
+        // we do this so the evalValues in the macros will be valid regardless of the number of instantiations
+        NDLNode<ElemType>* newNode = new NDLNode<ElemType>(name, node->GetValue(), this, ndlTypeMacroCall);
+        NDLScript<ElemType>* newScript = new NDLScript<ElemType>(*node->GetScript());
+        newNode->SetScript(newScript);
+        newNode->SetParamMacro(node->GetParamMacro());
+
+        // now get the parameters to the macro added
+        ConfigValue paramString = node->GetParamString();
+        ParseParameters(newNode, paramString, true /*createNew*/);
+        newNode->SetParamString(paramString);
+
+        // fix up the symbol table to point to this one instead
+        AssignSymbol(name, newNode);
+        return newNode;
+    }
+
+    // Evaluate - evaluate the script
+    // nodeEval - the node evaluator to call
+    // baseName - baseName for all labels
+    // pass - which NDLPass we are on
+    // skipThrough - skip through this node; eval will be skipped for all nodes up to and including this one
+    NDLNode<ElemType>* Evaluate(NDLNodeEvaluator<ElemType>& nodeEval, const wstring& baseName, const NDLPass pass=ndlPassInitial, NDLNode<ElemType>* skipThrough=nullptr)
+    {
+        NDLNode<ElemType>* nodeLast = skipThrough;
+        bool skip = skipThrough != nullptr;
+        std::wstring prevBaseName = GetBaseName();
+        SetBaseName(baseName);
+
+        for (auto& node : m_script)
+        {
+            // if we are in skip mode and we found the skipThrough node,
+            // move out of skip mode and start processing at the next node
+            if (skip)
+            {
+                if (node == skipThrough)
+                    skip = false;
+                continue;
+            }
+
+            // if it's a macro call, call the macro
+            if (node->GetType() == ndlTypeMacroCall)
+            {
+                node->EvaluateMacro(nodeEval, baseName, pass);
+                nodeEval.ProcessOptionalParameters(node);
+            }
+            else
+            {
+                nodeEval.Evaluate(node, baseName, pass);
+            }
+            nodeLast = node;
+        }
+        SetBaseName(prevBaseName);
+        return nodeLast;
+    }
+};
+
+}}}
diff --git a/MachineLearning/cn/SGD.h b/MachineLearning/cn/SGD.h
index bffa9dcea..81de3cf69 100644
--- a/MachineLearning/cn/SGD.h
+++ b/MachineLearning/cn/SGD.h
@@ -1,1564 +1,1587 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-#pragma once
-
-#include "basetypes.h"
-#include "ComputationNetwork.h"
-#include "ComputationNetworkHelper.h"
-#include "SimpleEvaluator.h"
-#include "DataReader.h"
-#include
-#include
-#include
-#include "fileutil.h"
-#include "commandArgUtil.h"
-#include
-#include
-
-#ifdef MPI_SUPPORT
-#include "mpi.h"
-#endif
-extern int myRank;
-extern int numProcs;
-
-using namespace std;
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-    template<class ElemType>
-    void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb)
-    {
-        size_t rv = 0;
-        if (numProcs > 1) for (auto it = mb.begin(); it != mb.end(); ++it)
-        {
-            MSR::CNTK::Matrix<ElemType>& mat = *(it->second);
-            size_t nCols = mat.GetNumCols();
-            size_t col_start = (nCols * myRank) / numProcs;
-            size_t col_end = (nCols * (myRank + 1)) / numProcs;
-            if (col_end > nCols) col_end = nCols; // this shouldn't happen
-            if (col_end == col_start)
-            {
-                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
-                mat.SetValue(tmp);
-            }
-            else
-            {
-                MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
-                mat.SetValue(tmp);
-            }
-            if (0 == rv)
-            {
-                rv = mat.GetNumCols();
-            }
-            else
-            {
-                if (rv != mat.GetNumCols())
-                    throw std::logic_error("Uneven number of columns among inputs.");
-            }
-        }
-    }
-
-    enum class LearningRateSearchAlgorithm : int
-    {
-        None,
-        AdjustAfterEpoch,
-        SearchBeforeEpoch
-    };
-
-    enum class AdaptationRegType : int
-    {
-        None,
-        KL
-    };
-
-    enum class GradientsUpdateType : int
-    {
-        None,
-        AdaGrad,
-        RmsProp
-    };
-
-    // configuration parameters associated with the RMSProp learning algorithm
-    typedef struct stRMSPropInfo
-    {
-        double gamma;
-        double inc;
-        double dec;
-        double max;
-        double min;
-        stRMSPropInfo()
-        {
-            gamma = 0.99;
-            inc = 1.2;
-            dec = 0.75;
-            max = 10.0;
-            min = 0.1;
-        }
-    } RMSPropInfo;
-
-    typedef struct stGradientUpdateInfo
-    {
-        GradientsUpdateType mType;
-        float mGaussianNoiseInjectStd;
-        stGradientUpdateInfo()
-        {
-            mType = GradientsUpdateType::AdaGrad;
-            mGaussianNoiseInjectStd = 0.0075f;
-        }
-    } GradientUpdateInfo;
-
-    template<class ElemType>
-    class SGD : ComputationNetworkHelper<ElemType>
-    {
-    protected:
-        typedef ComputationNetworkHelper<ElemType> B;
-        using B::SetMaxTempMemSizeForCNN; using B::SetDropoutRate; using B::UpdateEvalTimeStamps;
-        typedef ComputationNode<ElemType>* ComputationNodePtr;
-        typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
-
-    public:
-        SGD(const ConfigParameters& configSGD)
-        {
-            ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
-            floatargvector learningRatesPerMB = learningRatesPerMBStr;
-
-            ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", "");
-            floatargvector learningRatesPerSample = learningRatesPerSampleStr;
-
-            std::string executionEngineValue = configSGD("executionEngine", "synchronous");
-
-#ifdef USE_PTASK
-            // use PTask if we have more than one GPU or the MultiGPU flag is set
-            bool usePtask = (g_bestGpu != NULL && g_bestGpu->UseMultiple()) || (bool)configSGD("MultiGPU", "false");
-#else
-            bool usePtask = false;
-#endif
-            // AutoAdjust parameters
-            ConfigParameters configAALR(configSGD("AutoAdjust", ""));
-            LearningRateSearchAlgorithm autoAdjustLRType = ParseLearningRateSearchType(configAALR("autoAdjustLR", "None"));
-            ElemType reduceLearnRateIfImproveLessThan = configAALR("reduceLearnRateIfImproveLessThan", "0");
-            bool continueReduce = (bool)configAALR("continueReduce", "false");
-            size_t learnRateAdjustInterval = (size_t)configAALR("learnRateAdjustInterval", "1");
-            ElemType learnRateDecreaseFactor =
configAALR("learnRateDecreaseFactor", "0.618"); - ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");// std::numeric_limits::infinity()); - ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382"); - ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500"); - intargvector numMiniBatch4LRSearch = minibatch4LRSearch; - size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5"); - size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1"); - bool loadBestModel = configAALR("loadBestModel", "true"); - - ConfigArray minibatchSize = configSGD("minibatchSize", "256"); - intargvector mbSize = minibatchSize; - size_t epochSize = configSGD("epochSize", "0"); - - size_t maxEpochs = configSGD("maxEpochs"); - ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); - floatargvector momentumPerMB = momentumPerMBStr; - - wstring modelPath = configSGD("modelPath"); - wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", ""); - wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", ""); - - size_t maxTempMemSizeInSamplesForCNN = configSGD("maxTempMemSizeInSamplesForCNN", "0"); - - int traceLevel = configSGD("traceLevel", "0"); - size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10"); - - bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false"); - - bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true"); - ElemType clippingThresholdPerSample = configSGD("clippingThresholdPerSample", "1#INF"); // std::numeric_limits::infinity()); - - ConfigArray dropoutRatesStr = configSGD("dropoutRate", "0.0"); - floatargvector dropoutRates = dropoutRatesStr; - - GradientUpdateInfo gUpdateInfo; - GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD("gradUpdateType", "None")); - ElemType gaussianNoiseInjecStd = configSGD("gaussianNoiseInjectStd", "0"); - gUpdateInfo.mType = gradUpdateType; - gUpdateInfo.mGaussianNoiseInjectStd = (float)gaussianNoiseInjecStd; - - // extract RMSProp parameters from config, if they exist. Default to reasonable values. - RMSPropInfo rpi; - rpi.dec = (double)configSGD("rms_wgt_dec", "0.75"); - rpi.inc = (double)configSGD("rms_wgt_inc", "1.2"); - rpi.min = (double)configSGD("rms_wgt_min", "0.1"); - rpi.max = (double)configSGD("rms_wgt_max", "10.0"); - rpi.gamma = (double)configSGD("rms_gamma", "0.99"); - - /// for backward support. 
future setup should use gradUpdateType=AdaGrad, instead of - /// useAdagrad=true - bool useAdagrad = configSGD("useAdagrad", "false"); - if (useAdagrad) - { - gradUpdateType = GradientsUpdateType::AdaGrad; - gUpdateInfo.mType = gradUpdateType; - } - - AdaptationRegType adaptationRegType = ParseAdaptationRegType(configSGD("adaptationRegType", "None")); - ElemType adaptationRegWeight = configSGD("adaptationRegWeight", "0"); - - /// gradient check setup - bool doGradientCheck = configSGD("gradientcheck", "false"); - ElemType gradientCheckSigDigit = configSGD("sigFigs", "6"); - - bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true"); - - Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation, - clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor, - reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates, - loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult, - maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight, - trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, - rpi, learnRateAdjustInterval); - } - - void setMomentum(float momentum) - { - m_momentumPerMB = (ElemType)momentum; - } - - //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample - void Init(const floatargvector& learningRatesPerMB, const floatargvector& learningRatesPerSample, const intargvector& mbSize, - const size_t epochSize, const size_t maxEpochs, - const wstring& modelPath, const floatargvector& momentumPerMB, const bool gradientClippingWithTruncation = true, - const ElemType clippingThresholdPerSample=std::numeric_limits::infinity(), - const LearningRateSearchAlgorithm autoLearnRateSearchType = LearningRateSearchAlgorithm::None, - const ElemType increaseLearnRateIfImproveMoreThan = std::numeric_limits::infinity(), const ElemType learnRateIncreaseFactor = 1.382f, - const ElemType reduceLearnRateIfImproveLessThan=0, const bool continueReduce=false, const ElemType learnRateDecreaseFactor = 0.618f, floatargvector dropoutRates = floatargvector(L"0.0f"), - const bool loadBestModel=true, const intargvector& numMiniBatch4LRSearch=intargvector(L"500"), const size_t numPrevLearnRates = 5, - const size_t numBestSearchEpoch = 1, const int traceLevel = 0, - const size_t numMBsToShowResult = 10, const size_t maxTempMemSizeInSamplesForCNN = 0, - const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None, - const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"", - const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true, - RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1) - { - numPrevLearnRates; - m_mbSize=mbSize; - m_epochSize=epochSize; - if (m_epochSize == 0) - { - m_epochSize = requestDataSize; - } - m_maxEpochs=maxEpochs; - - m_gradientClippingWithTruncation=gradientClippingWithTruncation; - m_modelPath=modelPath; - m_autoLearnRateSearchType=autoLearnRateSearchType; - 
m_traceLevel=traceLevel; - m_loadBestModel=loadBestModel; - m_increaseLearnRateIfImproveMoreThan=increaseLearnRateIfImproveMoreThan; - m_learnRateIncreaseFactor=learnRateIncreaseFactor; - m_reduceLearnRateIfImproveLessThan=reduceLearnRateIfImproveLessThan; - m_continueReduce=continueReduce; - m_learnRateAdjustInterval = max(1, learnRateAdjustInterval); //minimum interval is 1 epoch - m_learnRateDecreaseFactor=learnRateDecreaseFactor; - m_clippingThresholdPerSample=abs(clippingThresholdPerSample); - m_numMiniBatch4LRSearch=numMiniBatch4LRSearch; - m_dropoutRates=dropoutRates; - m_numMBsToShowResult=int(numMBsToShowResult); - m_numBestSearchEpoch=numBestSearchEpoch; - m_maxTempMemSizeInSamplesForCNN=maxTempMemSizeInSamplesForCNN; - m_gradType = gradUpdateType; - m_rpi = rpi; - m_usePtask = usePtask; - m_keepCheckPointFiles = keepCheckPointFiles; - - m_adaptationRegType = adaptationRegType; - m_adaptationRegWeight = adaptationRegWeight; - - m_trainCriterionNodeName = trainCriterionNodeName; - m_evalCriterionNodeName = evalCriterionNodeName; - - for (size_t i=0; i 0 && learningRatesPerMB.size() > 0) - { - throw std::invalid_argument ("You specified both learningRatesPerSample and learningRatesPerMB. Please comment out one of them."); - } - else if (learningRatesPerSample.size() > 0) - { - m_learningRatesPerSample=learningRatesPerSample; - } - else if (learningRatesPerMB.size() > 0) - { - int LRSize = (int)max(learningRatesPerMB.size(), m_mbSize.size()); - m_learningRatesPerSample.resize(LRSize); - for (int i=0; i0) - { - m_momentumInputPerMB=momentumPerMB; - if (m_momentumInputPerMB[0]>=1 || m_momentumInputPerMB[0]<0) - throw std::invalid_argument ("momentumPerMB must be in [0, 1)."); - } - - if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor<1) - { - throw std::invalid_argument ("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1."); - } - - for (size_t i=0; i= 1 || m_dropoutRates[i] < 0) - { - throw std::invalid_argument ("dropoutRate must be >= 0 and < 1."); - } - } - - if (m_adaptationRegWeight > 1 || m_adaptationRegWeight <0) - throw invalid_argument("adaptationRegWeight must be in [0 1]"); - - m_minLearnRate = 1e-9f; - - m_needRegularization = false; - - m_doGradientCheck = doGradientCheck; - m_gradientCheckSigDigit = gradientCheckSigDigit; - m_validateAfterModelReloading = validateAfterModelReloading; - - msra::files::make_intermediate_dirs (m_modelPath); - } - - void Adapt(wstring origModelFileName, wstring refNodeName, IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, const DEVICEID_TYPE deviceID, const bool makeMode = true) - { - if (origModelFileName == L"" || trainSetDataReader == nullptr) - throw std::invalid_argument ("origModel and trainSetDataReader should not be null."); - - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - ComputationNetwork net(deviceID); - if (startEpoch >= 0) - { - wstring modelFileName = GetModelNameForEpoch(int(startEpoch)-1); - fprintf(stderr, "Starting from checkpoint. 
Load Network From File %ls.\n", modelFileName.c_str()); - net.LoadFromFile(modelFileName); - } - else - { - fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str()); - net.LoadFromFile(origModelFileName); - } - - startEpoch = max(startEpoch, 0); - - ComputationNetwork refNet(deviceID); - m_needRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0; - if (m_needRegularization) - { - fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str()); - refNet.LoadFromFile(origModelFileName); - } - - ComputationNodePtr refNode = nullptr; - if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL) - { - fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str()); - if (refNodeName == L"") - throw invalid_argument("refNodeName does not exist and is needed when adaptationRegType is KL."); - - refNode = refNet.GetNodeFromName(refNodeName); - } - - TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader); - } - - void Train(IComputationNetBuilder* netBuilder, IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, const bool makeMode = true) - { - if (netBuilder == nullptr || trainSetDataReader == nullptr) - throw std::invalid_argument ("netBuilder and trainSetDataReader should not be null.\n"); - - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - wstring modelFileName = GetModelNameForEpoch(int(startEpoch)-1); - if (startEpoch >= 0) - fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str()); - ComputationNetwork& net = - startEpoch<0? netBuilder->BuildNetworkFromDescription() : netBuilder->LoadNetworkFromFile(modelFileName); - // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model - // strategy should be to run the initializer above on myRank==0, and then broadcast parameters. 
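+// Illustrative sketch (editorial): one way to realize the TODO above, by
+// broadcasting rank 0's freshly initialized parameters to the other ranks.
+// It assumes the learnable-node list is available at this point and reuses
+// the CopyToArray()/SetValue() pattern of the Allreduce-based model averaging
+// further down in this file.
+#if 0
+for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); ++nodeIter)
+{
+    Matrix<ElemType>& mat = (*nodeIter)->FunctionValues();
+    ElemType* buf = mat.CopyToArray(); // host-side copy of the weights
+    size_t n = mat.GetNumElements();
+    MPI_Bcast(buf, (int)n, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    if (myRank != 0) // non-master ranks overwrite their unsynchronized values
+        mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), buf);
+    delete[] buf;
+}
+#endif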
- - startEpoch = max(startEpoch, 0); - m_needRegularization = false; - - TrainOrAdaptModel(startEpoch, net, net, nullptr, trainSetDataReader, validationSetDataReader); - } - - protected: - std::vector GetTrainCriterionNodes(ComputationNetwork& net) - { - fprintf(stderr, "GetTrainCriterionNodes %ls ...\n", m_trainCriterionNodeName.c_str()); - if (!m_trainCriterionNodeName.empty()) - { - std::vector nodes; - ComputationNodePtr node = net.GetNodeFromName(m_trainCriterionNodeName); - net.ValidateNetwork(node); - if (node->FunctionValues().GetNumElements() != 1) - throw invalid_argument("the trainCriterionNodeName specified in the config file is not a valid training criterion node."); - - nodes.push_back(node); - return nodes; - } - else - return net.FinalCriterionNodes(); - } - std::vector GetEvalCriterionNodes(ComputationNetwork& net) - { - fprintf(stderr, "GetEvalCriterionNodes %ls ...\n", m_evalCriterionNodeName.c_str()); - if (!m_evalCriterionNodeName.empty()) - { - std::vector nodes; - ComputationNodePtr node = net.GetNodeFromName(m_evalCriterionNodeName); - net.ValidateNetwork(node); - if (node->FunctionValues().GetNumElements() != 1) - throw invalid_argument("the evalCriterionNodeName specified in the config file is not a valid evaluation criterion node."); - - nodes.push_back(node); - return nodes; - } - else - return net.EvaluationNodes(); - } - - void TrainOrAdaptModel(int startEpoch, ComputationNetwork& net, ComputationNetwork& refNet, ComputationNodePtr refNode, - IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) - { - std::vector & FeatureNodes = net.FeatureNodes(); - std::vector & labelNodes = net.LabelNodes(); - std::vector criterionNodes = GetTrainCriterionNodes(net); - std::vector evaluationNodes = GetEvalCriterionNodes(net); - - std::map*> inputMatrices; - for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); - } - for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); - } - - // special handling of classed based softmax node. Need a better solution to it. - if (criterionNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName() || - evaluationNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName()) - { - size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); - int deviceId = FeatureNodes[0]->FunctionValues().GetDeviceId(); - inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); - inputMatrices[L"classinfo"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); - } - - - //used for KLD regularized adaptation. For all other adaptation techniques use MEL to edit the model and using normal training algorithm - std::vector refFeatureNodes; - if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) - { - refFeatureNodes.resize(FeatureNodes.size()); - for (size_t i=0; iNodeName()); //we need to keep this info to handle deletion - refNet.ChangeNode(FeatureNodes[i]->NodeName(), FeatureNodes[i]); - } - - refNet.RebuildNetwork(refNode); - } - - //initializing weights and gradient holder - std::list& learnableNodes = net.LearnableNodes(criterionNodes[0]); //only one criterion so far TODO: support multiple ones? 
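+// Editorial note: one smoothed-gradient matrix is kept per learnable node,
+// sized like the node's value matrix. NormalGrad/Adagrad/RmsProp (see
+// UpdateWeightsS below) use it as their persistent state across minibatches,
+// and SaveCheckPointInfo/LoadCheckPointInfo serialize it alongside the model.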
-        std::list<Matrix<ElemType>> smoothedGradients;
-
-        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-        {
-            ComputationNodePtr node = (*nodeIter);
-            smoothedGradients.push_back(Matrix<ElemType>(node->FunctionValues().GetNumRows(), node->FunctionValues().GetNumCols(), net.GetDeviceID()));
-        }
-
-        ElemType epochCriterion, avgCriterion, prevCriterion;
-        epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<ElemType>::infinity();
-        size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
-
-        std::vector<ElemType> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<ElemType>::infinity());
-
-        std::vector<wstring> evalNodeNames;
-        for (size_t i = 0; i < evaluationNodes.size(); i++)
-            evalNodeNames.push_back(evaluationNodes[i]->NodeName());
-
-        size_t totalSamplesSeen = 0;
-        ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch];
-
-        int m_numPrevLearnRates = 5; // used to control the upper learning rate in the LR search, to reduce computation
-        vector<ElemType> prevLearnRates;
-        prevLearnRates.resize(m_numPrevLearnRates);
-        for (int i = 0; i < m_numPrevLearnRates; i++)
-            prevLearnRates[i] = std::numeric_limits<ElemType>::infinity();
-
-        // precompute mean and invStdDev nodes and save the initial model
-        if (PreCompute(net, trainSetDataReader, FeatureNodes, labelNodes, inputMatrices) || startEpoch == 0)
-            if (0 == myRank) // only needs to be done by one process
-                net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
-
-        bool learnRateInitialized = false;
-        if (startEpoch > 0)
-        {
-            learnRateInitialized = LoadCheckPointInfo(startEpoch-1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
-            setMomentum(m_momentumInputPerMB[m_momentumInputPerMB.size()-1]);
-        }
-
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch)
-            throw std::invalid_argument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in the config for the starting epoch.");
-
-        unsigned long dropOutSeed = 1;
-        ElemType prevDropoutRate = 0;
-
-        bool learnRateReduced = false;
-
-        SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN);
-        if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
-            SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN);
-
-        // build the PTask graph if they want to use PTask
-        // NOTE: the graph is currently only for training, so other operations will still use the usual method
-        // (i.e. rate adjustment and other custom operations still use the non-PTask method)
-        if (m_usePtask)
-        {
-            // set the minibatch size to the largest thing we will ever see
-            int maxMbSize = 0;
-            for (int val : m_mbSize)
-            {
-                maxMbSize = max(val, maxMbSize);
-            }
-            net.SetActualMiniBatchSize(maxMbSize);
-            net.BuildPTaskGraph();
-        }
-
-        for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
-        {
-            auto t_start_epoch = clock();
-
-            // set other information in inputMatrices that can contain information
-            // used for class-based LM, e.g. clustering information
-            SetOtherInfo(net, trainSetDataReader, validationSetDataReader, inputMatrices);
-
-            // set the dropout rate
-            SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
-
-            // learning rate adjustment
-            if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
-            {
-                learnRatePerSample = m_learningRatesPerSample[i];
-                setMomentum(m_momentumInputPerMB[i]);
-            }
-            else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
-            {
ElemType largestPrevLearnRatePerSample = prevLearnRates[0]; - for (int j = 1; j < m_numPrevLearnRates; j++) - { - largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); - } - - //return a reasonable learning rate based on the initial mbsize - learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, trainSetDataReader, FeatureNodes, - labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample); - - prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; //save per sample learn rate to support changeable mbsize - } - - learnRateInitialized = true; - - if (learnRatePerSample < m_minLearnRate) - { - fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); - if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) - net.SaveToFile(m_modelPath); - break; - } - -#ifdef MPI_SUPPORT - INT32 mySamples = (INT32) -#endif - TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes, - criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - - auto t_end_epoch = clock(); - ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC); - - fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion); - if (epochEvalErrors.size() == 1) - { - fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); - } - else - { - fprintf(stderr, "EvalErr Per Sample "); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime); - fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]); - } - -#ifdef MPI_SUPPORT - // model reduction and averaging - if (numProcs > 0) - { - ElemType factor; // weight for the parameter of my model - { - // compute total minibatch size - INT32 allSamples = 0; - MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (allSamples == 0) allSamples = 1; - - factor = (ElemType)mySamples / (ElemType)allSamples; - } - - for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) - { - ComputationNodePtr node = (*nodeIter); - Microsoft::MSR::CNTK::Matrix &mat = node->FunctionValues(); - - // weight model by relative size of minibatch samples (and number of processors, for averaging) - ElemType *px = mat.CopyToArray(); - size_t nx = mat.GetNumElements(); - transform(px, px + nx, px, [factor](ElemType&val)->ElemType{return val * factor; }); - - // TODO: Replace default Allreduce with the reduction-shuffle-dance - vector py = vector(nx, ElemType(0)); - MPI_Allreduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? 
MPI_FLOAT : MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0])); - delete px; - } - } -#endif - - if ( 0 == myRank ) // only evaluate once, on the master process. TODO: This could be faster by farming out the validation parts - if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr) - { - SimpleEvaluator evalforvalidation(net); - vector cvSetTrainAndEvalNodes; - cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName()); - cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); - - vector vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n", - i + 1, vScore[0], vScore[1]); - - epochCriterion = vScore[0]; //the first one is the training criterion. - } -#ifdef MPI_SUPPORT - // ensure all processes have the same epochCriterion - MPI_Bcast(&epochCriterion, 1, sizeof(epochCriterion) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD); -#endif - - bool loadedPrevModel = false; - size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1; - if (avgCriterion == std::numeric_limits::infinity()) - avgCriterion = epochCriterion; - else - avgCriterion = ((epochsSinceLastLearnRateAdjust -1 - epochsNotCountedInAvgCriterion)* avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) - { - if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits::infinity()) - { - if (m_loadBestModel) - { - net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i-1), m_validateAfterModelReloading); - net.ResetEvalTimeStamp(); - LoadCheckPointInfo(i-1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); - fprintf(stderr, "Loaded the previous model which has better training criterion.\n"); - loadedPrevModel = true; - } - } - - if(m_continueReduce) - { - if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - if(learnRateReduced == false) - { - learnRateReduced = true; - } - else - { - if ( myRank == 0 ) - net.SaveToFile(GetModelNameForEpoch(i, true)); - fprintf(stderr, "Finished training and saved final model\n\n"); - break; - } - } - if(learnRateReduced) - { - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - } - else - { - if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan*prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - learnRatePerSample *= m_learnRateIncreaseFactor; - fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample); - } - } - } - - if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) //not loading previous values then set them - { - prevCriterion = avgCriterion; - epochsNotCountedInAvgCriterion = 0; - } - - //persist model and 
check-point info
-            if (0 == myRank)
-            {
-                net.SaveToFile(GetModelNameForEpoch(i));
-                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
-                if (!m_keepCheckPointFiles)
-                    _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); // delete the previous checkpoint file to save space
-            }
-
-            if (learnRatePerSample < 1e-12)
-                fprintf(stderr, "learnRate per sample is reduced to %.8g, which is below 1e-12. Stop training.\n", learnRatePerSample);
-        }
-
-        if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) // since we linked feature nodes, we need to remove them from the deletion
-        {
-            for (size_t i = 0; i < refFeatureNodes.size(); i++)
-            {
-                refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]); // note we need to handle deletion carefully
-            }
-        }
-
-        if (inputMatrices[L"classinfo"])
-        {
-            delete inputMatrices[L"classinfo"];
-            inputMatrices.erase(L"classinfo");
-        }
-        if (inputMatrices[L"idx2cls"])
-        {
-            delete inputMatrices[L"idx2cls"];
-            inputMatrices.erase(L"idx2cls");
-        }
-    }
-
-    protected:
-
-        // returns true if precomputation was executed
-        bool PreCompute(ComputationNetwork<ElemType>& net,
-                        IDataReader<ElemType>* trainSetDataReader,
-                        std::vector<ComputationNodePtr>& FeatureNodes,
-                        std::vector<ComputationNodePtr>& labelNodes,
-                        std::map<std::wstring, Matrix<ElemType>*>& inputMatrices)
-        {
-            std::list<ComputationNodePtr> nodes = net.GetNodesRequirePreComputation();
-
-            if (nodes.size() == 0)
-            {
-                fprintf(stderr, "No PreCompute nodes found, skipping the PreCompute step\n");
-                return false;
-            }
-
-            fprintf(stderr, "Found %lu PreCompute nodes\n", nodes.size());
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-            {
-                PreComputedNode<ElemType>* node = static_cast<PreComputedNode<ElemType>*>(*nodeIter);
-                fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
-            }
-
-            // compute
-            //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, requestDataSize);
-            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // only based on one epoch
-
-            while (trainSetDataReader->GetMinibatch(inputMatrices))
-            {
-                UpdateEvalTimeStamps(FeatureNodes);
-                UpdateEvalTimeStamps(labelNodes);
-
-                size_t actualMBSize = net.GetActualMBSize();
-                net.SetActualMiniBatchSize(actualMBSize);
-                net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
-                trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd);
-
-                for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-                {
-                    net.Evaluate(*nodeIter);
-                }
-            }
-
-            // mark done
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-            {
-                PreComputedNode<ElemType>* node = static_cast<PreComputedNode<ElemType>*>(*nodeIter);
-                node->MarkComputed(true);
-            }
-
-            return true;
-        }
-
-        // return a reasonable initial learning rate based on the initial minibatch size
-        ElemType SearchLearnRateBeforeEpoch(ComputationNetwork<ElemType>& net, ComputationNetwork<ElemType>& refNet, const ComputationNodePtr refNode,
-                                            const int epochNumber, const ElemType curLearnRate,
-                                            IDataReader<ElemType>* trainSetDataReader,
-                                            const std::vector<ComputationNodePtr>& FeatureNodes,
-                                            const std::vector<ComputationNodePtr>& labelNodes,
-                                            const std::vector<ComputationNodePtr>& criterionNodes,
-                                            const std::vector<ComputationNodePtr>& evaluationNodes,
-                                            std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
-                                            const std::list<ComputationNodePtr>& learnableNodes,
-                                            std::list<Matrix<ElemType>>& smoothedGradients, const bool /*learnRateInitialized*/, const ElemType largestPrevLearnRatePerSample)
-        {
-            ElemType epochCriterion = std::numeric_limits<ElemType>::infinity(), prevCriterion = std::numeric_limits<ElemType>::infinity();
-            vector<ElemType> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<ElemType>::infinity());
-            //ElemType epochEvalError = std::numeric_limits<ElemType>::infinity();
-            size_t totalSamplesSeen = 0;
-            ElemType bestLearnRatePerSample = curLearnRate;
-
-            size_t epochSize
= m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; - if (m_epochSize != requestDataSize) - { - epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision - } - - ElemType baseCriterion; - - ElemType minLearnRate = m_minLearnRate * 0.3f; - ElemType learnRatePerSample = 1.0f / 8.0f / 0.618f /sqrt((ElemType)m_mbSize[epochNumber]); - - if (largestPrevLearnRatePerSample != std::numeric_limits::infinity()) - learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f; //largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety - - int baseModelEpoch = epochNumber-1; - net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading); - net.ResetEvalTimeStamp(); - - ElemType learnRate =learnRatePerSample; - LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, smoothedGradients, prevCriterion); - - //if model is not changed this is what we will get - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, 0, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, baseCriterion, epochEvalErrors, totalSamplesSeen); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) - { - if (prevCriterion == std::numeric_limits::infinity()) - prevCriterion = baseCriterion; - ElemType ratio = 0.3f; - if (m_epochSize != requestDataSize) - { - ratio = pow(((ElemType)epochSize) / m_epochSize, 1.0f/2); - } - baseCriterion = max(ratio * prevCriterion + (1-ratio) * baseCriterion, baseCriterion); - } - - do - { - learnRatePerSample *= 0.618f; - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); - - } while (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate); - - bestLearnRatePerSample = learnRatePerSample; - - if (epochNumber < m_numBestSearchEpoch) //grid search for the first m_numBestSearchEpoch epochs - { - ElemType leftLearnRatePerSample = 0.01f / m_mbSize[epochNumber], rightLearnRatePerSample = learnRatePerSample; - ElemType leftCriterion, rightCriterion = epochCriterion; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); - - while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) - { - if (rightCriterion > leftCriterion) - { - rightLearnRatePerSample *= 0.618f; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, rightLearnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, rightCriterion, epochEvalErrors, totalSamplesSeen); - } - else - { - leftLearnRatePerSample /= 0.618f; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); - } - } - - bestLearnRatePerSample = (leftCriterion < rightCriterion)? 
leftLearnRatePerSample : rightLearnRatePerSample; - } - - fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n", epochNumber+1, bestLearnRatePerSample, baseCriterion); - - return bestLearnRatePerSample; - } - - void TrainOneMiniEpochAndReloadModel(ComputationNetwork& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, - const int epochNumber,const size_t epochSize, IDataReader* trainSetDataReader, const ElemType learnRatePerSample, - const std::vector& FeatureNodes, - const std::vector& labelNodes, - const std::vector& criterionNodes, - const std::vector& evaluationNodes, - std::map*>& inputMatrices, - const std::list& learnableNodes, - std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) - { - TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample,FeatureNodes,labelNodes, - criterionNodes,evaluationNodes,inputMatrices, learnableNodes,smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ", epochCriterion); - if (epochEvalErrors.size()==1) - fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n", epochEvalErrors[0], learnRatePerSample); - else - { - fprintf(stderr, "EvalErr Per Sample "); - for (size_t i=0; i& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, - const int epochNumber, const size_t epochSize, - IDataReader* trainSetDataReader, const ElemType learnRatePerSample, - const std::vector& FeatureNodes, - const std::vector& labelNodes, - const std::vector& criterionNodes, - const std::vector& evaluationNodes, - std::map*>& inputMatrices, - const std::list& learnableNodes, - std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) - { - ElemType readTimeInMBs = 0, ComputeTimeInMBs = 0, epochCriterionLastMBs = 0; - int numSamplesLastMBs = 0; - std::vector epochEvalErrorsLastMBs(epochEvalErrors.size(),0); - PTaskGraphBuilder* ptaskGraphBuilder = NULL; - - clock_t startReadMBTime = 0, startComputeMBTime=0; - clock_t endReadMBTime=0, endComputeMBTime=0; - - //initialize statistics - size_t totalEpochSamples = 0; - - int numMBsRun = 0; - bool beginEpoch = true; - - size_t numEvalNodes = epochEvalErrors.size(); - - // NOTE: the following two local matrices are not used in PTask path - Matrix localEpochCriterion(1,1,net.GetDeviceID()); //assume only one training criterion node for each epoch - Matrix localEpochEvalErrors(1,numEvalNodes,net.GetDeviceID()); - - localEpochCriterion.SetValue(0); - localEpochEvalErrors.SetValue(0); - - if (m_usePtask) - { - epochCriterion = ElemType(0.0); - epochEvalErrors.assign(numEvalNodes, ElemType(0.0)); - } - - trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); - - // build the PTask graph if they want to use ptask - // NOTE: the graph is currently only for training, so other operations will still use the usual method, - // (i.e rate adjustment, regularization and other custom operations still use the non PTask method) - if (m_usePtask) - { - ptaskGraphBuilder = net.GetPTaskGraphBuilder(); - ptaskGraphBuilder->UpdateParameters(this, learnRatePerSample, m_mbSize[epochNumber]); - ptaskGraphBuilder->StartPTaskGraph(); - - // currently CNTK likes to keep things on the GPU, and PTask expects things to be on the CPU, so tell CNTK to keep data on the CPU - for 
(std::pair*> inpair : inputMatrices) - { - Matrix* mat = inpair.second; - mat->SetPreferredDeviceId(CPUDEVICE); - mat->TransferFromDeviceToDevice(mat->GetDeviceId(), CPUDEVICE, true); - } - } - - startReadMBTime=clock(); - while (trainSetDataReader->GetMinibatch(inputMatrices)) - { -#ifdef MPI_SUPPORT - DecimateMinibatch(inputMatrices); -#endif - endReadMBTime=clock(); - startComputeMBTime=clock(); - - UpdateEvalTimeStamps(FeatureNodes); - UpdateEvalTimeStamps(labelNodes); - - size_t actualMBSize = net.GetActualMBSize(); - if (0 == actualMBSize) - continue; - - net.SetActualMiniBatchSize(actualMBSize); - net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); - trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd); - -#ifndef EVALDLL - if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false) - { - throw std::logic_error("cannot pass gradient checker"); - } -#endif - if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) //TODO: currently only support one node regularization - { - refNet.SetActualMiniBatchSize(actualMBSize); - refNet.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); - refNet.Evaluate(refNode); - Matrix::ScaleAndAdd(m_adaptationRegWeight, refNode->FunctionValues(), 1-m_adaptationRegWeight, labelNodes[0]->FunctionValues()); - } - - if (m_usePtask) - { - // Pushing data in the graph starts things going - bool endOfEpoch = trainSetDataReader->DataEnd(endDataEpoch); - CONTROLSIGNAL signal = beginEpoch?DBCTLC_BOF:DBCTLC_NONE; - if (endOfEpoch) - signal |= DBCTLC_EOF; - - ptaskGraphBuilder->PushData(inputMatrices, signal); - ptaskGraphBuilder->PushActualMBSize(learnableNodes, net.GetActualMBSize(), signal); - beginEpoch = false; // clear this out after first epoch - - // pull the values from the graph for the totals - epochCriterion += ptaskGraphBuilder->GetValue(criterionNodes[0]); - for (size_t i=0; iGetValue(evaluationNodes[i]); - } - - // NOTE: update model parameters is part of the graph, so nothing to do here - } - else - { - if (learnRatePerSample > m_minLearnRate * 0.01) //only compute gradient when learning rate is large enough - net.ComputeGradient(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? - else - net.Evaluate(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? 
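+// Editorial note: under MPI_SUPPORT, DecimateMinibatch (defined near the top
+// of this file) has already split each input matrix by columns, so rank r
+// keeps columns [nCols*r/numProcs, nCols*(r+1)/numProcs). For example, 1000
+// columns across 4 ranks leaves 250 columns per rank before the gradient
+// computation above.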
- - Matrix::AddElementToElement(criterionNodes[0]->FunctionValues(), 0, 0, localEpochCriterion, 0, 0); - - std::vectormbEvalErrors(numEvalNodes,0); - for (size_t i=0; i::AddElementToElement(evaluationNodes[i]->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i); - } - - //update model parameters - if (learnRatePerSample > m_minLearnRate * 0.01) - { - auto smoothedGradientIter=smoothedGradients.begin(); - for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++) - { - ComputationNodePtr node = (*nodeIter); - Matrix& smoothedGradient = (*smoothedGradientIter); - - UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber]); - } - } - } - - - endComputeMBTime=clock(); - numMBsRun ++; - if (m_traceLevel > 0) - { - ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(CLOCKS_PER_SEC); - ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/CLOCKS_PER_SEC; - - readTimeInMBs += MBReadTime; - ComputeTimeInMBs += MBComputeTime; - numSamplesLastMBs += int(actualMBSize); - - if (numMBsRun % m_numMBsToShowResult == 0) - { - if (!m_usePtask) - { // get the epoch Values updated, in PTask don't use the loclEpoch* temporary matrices - epochCriterion = localEpochCriterion.Get00Element(); - for (size_t i=0; i< numEvalNodes; i++) - epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); - } - - fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ",epochNumber+1, numMBsRun-m_numMBsToShowResult+1, numMBsRun, numSamplesLastMBs, - (epochCriterion-epochCriterionLastMBs)/numSamplesLastMBs); - for (size_t i=0; i= epochSize) - break; - - /// call DataEnd function - /// DataEnd does reader specific process if sentence ending is reached - trainSetDataReader->DataEnd(endDataSentence); - - } - - if (m_usePtask) - { - // when the epoch is complete, we need to transfer all the values back to the LearnableNodes, which will be saved off as the model - std::list learnableNodes = net.LearnableNodes(criterionNodes[0]); - for (ComputationNodePtr node : learnableNodes) - { - ptaskGraphBuilder->GetValue(node, node->FunctionValues()); - } - epochCriterion /= float(totalEpochSamples); - for (size_t i=0; i< numEvalNodes; i++) - { - epochEvalErrors[i] /= float(totalEpochSamples); - } - } - else - { - localEpochCriterion /= float(totalEpochSamples); - localEpochEvalErrors /= float(totalEpochSamples); - - epochCriterion = localEpochCriterion.Get00Element(); - for (size_t i=0; i< numEvalNodes; i++) - { - epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); - } - } - return totalEpochSamples; - } -public: - // UpdateWeightsS - static version of UpdateWeights() - static void UpdateWeightsS(const SGD* sgd, Matrix& functionValues, Matrix& gradientValues, Matrix& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize) - { -#if DUMPOUTPUT - fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",learnRatePerSample, actualMBSize, expectedMBSize); - fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f, sgd->MomentumPerMB()=%0.8f\n",sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd(), sgd->MomentumPerMB()); - gradientValues.Print("Gradient Input"); - smoothedGradient.Print("Smoothed Gradient Input"); -#endif - - // make actualMBSize is a valid value - assert(actualMBSize > 0); - - //clipping gradients to prevent outliers - sgd->ClipGradient(gradientValues, actualMBSize); 
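+// Editorial note on the momentum handling below: for a partial minibatch the
+// momentum is rescaled as
+//     momentum' = exp(log(momentum) / expectedMBSize * actualMBSize)
+//               = momentum^(actualMBSize / expectedMBSize)
+// which keeps the effective per-sample momentum constant when a minibatch is
+// smaller than the configured size.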
- - GradientsUpdateType adpType = sgd->GradUpdateType(); - ElemType noiseStd = sgd->GradientUpdateNoiseStd(); - Matrix sgdUpdateNoise((DEVICEID_TYPE)functionValues.GetDeviceId()); - if (noiseStd > 0) - { - sgdUpdateNoise.SetValue(gradientValues); /// get the gradient structure since gradient is sparse - sgdUpdateNoise.SetGaussianRandomValue(0, noiseStd); // reset its value to random - } - - if (adpType == GradientsUpdateType::None) - { - ElemType momentum = sgd->MomentumPerMB(); - if (actualMBSize < expectedMBSize && momentum > 0.0000001f) //we use simple linear (instead of log linear) scaling here - { - momentum = (ElemType) exp (log(momentum)/expectedMBSize * actualMBSize); - } - smoothedGradient.NormalGrad(gradientValues, functionValues, learnRatePerSample, momentum); - } - if (adpType == GradientsUpdateType::AdaGrad) - { - smoothedGradient.Adagrad(gradientValues); - Matrix::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues); - } - if (adpType == GradientsUpdateType::RmsProp) - { - // include L2 regularizer - Matrix::ScaleAndAdd((ElemType)0.001, functionValues, gradientValues); - smoothedGradient.RmsProp(gradientValues, (ElemType)sgd->m_rpi.gamma, (ElemType)sgd->m_rpi.inc, (ElemType)sgd->m_rpi.max, (ElemType)sgd->m_rpi.dec, (ElemType)sgd->m_rpi.min); - Matrix::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues); - } - - if (noiseStd > 0) - { - Matrix::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues); - } -#if DUMPOUTPUT - functionValues.Print("Parameter Update"); -#endif - } -protected: - // UpdateWeights - update the weights in - void UpdateWeights(const ComputationNodePtr node, Matrix& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize) const - { -#if DUMPOUTPUT - fprintf(stderr, "Update_%ls\n",node->NodeName().c_str()); -#endif - UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize); - node->UpdateEvalTimeStamp(); - } - - void ClipGradient(Matrix& gradient, const size_t actualMBSize) const - { - if (m_clippingThresholdPerSample != std::numeric_limits::infinity()) - { - ElemType maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize; - if (m_gradientClippingWithTruncation) - { - gradient.InplaceTruncate(maxGradientPerMB); - } - else //norm2 normalized - { - ElemType gradientNorm = gradient.FrobeniusNorm(); - if (gradientNorm > maxGradientPerMB) - { - ElemType normFactor = maxGradientPerMB / gradientNorm; - gradient *= normFactor; - } - } - } - } - - void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample, - const std::list>& smoothedGradients, const ElemType prevCriterion) - { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); - - File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite); - fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); - - fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); - fstream << totalSamplesSeen << learnRatePerSample << prevCriterion; - fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); - - fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); - - for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) - { - const Matrix& smoothedGradient = (*smoothedGradientIter); - fstream << smoothedGradient; - } - 
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient"); - - fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP"); - } - - bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, ElemType& learnRatePerSample, - std::list>& smoothedGradients, ElemType& prevCriterion) - { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); - if (!fexists(checkPointFileName.c_str()) ) - { - fprintf(stderr, "Warning: checkpiont file is missing. learning parameters will be initialized from 0\n"); - return false; - } - - File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); - - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); - fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); - - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); - - for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) - { - Matrix& smoothedGradient = (*smoothedGradientIter); - fstream >> smoothedGradient; - } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient"); - - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP"); - - return true; - } - - wstring GetCheckPointFileNameForEpoch (const int epoch) - { - return GetModelNameForEpoch (epoch) + L".ckp"; - } - - wstring GetModelNameForEpoch (const int epoch, bool bLastModel = false) - { - int epoch1Base = epoch + 1; - if (epoch1Base == m_maxEpochs || bLastModel) - return m_modelPath; - else - return msra::strfun::wstrprintf (L"%s.%d", m_modelPath.c_str(), (int) epoch1Base); - } - - //return -1 if nothing exists - int DetermineStartEpoch (const bool makeMode) - { - if (!makeMode) - return -1; //always start from scratch - - int firstEpoch = -1; - - wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs)-1); - for (int e = int(m_maxEpochs)-1; e >= -1; e--) - { - const wstring prevEpochFile = GetModelNameForEpoch (e-1); - - if (msra::files::fuptodate (curEpochFile, prevEpochFile, false)) - { - firstEpoch = size_t(e)+1; - break; - } - else - curEpochFile = prevEpochFile; - } - - return firstEpoch; - } - - AdaptationRegType ParseAdaptationRegType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"" || s == L"none") - return AdaptationRegType::None; - else if (s == L"kl" || s == L"klreg" ) - return AdaptationRegType::KL; - else - throw std::invalid_argument( - "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are " - "(None | KL)"); - } - - GradientsUpdateType ParseGradUpdateType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"" || s == L"none") - return GradientsUpdateType::None; - else if (s == L"adagrad") - return GradientsUpdateType::AdaGrad; - else if (s == L"rmsprop") - return GradientsUpdateType::RmsProp; - else - throw std::invalid_argument( - "ParseGradUpdateType: Invalid Gradient Updating Type. 
Valid values are " - "(None | AdaGrad | RmsProp )"); - } - - LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"false" || s == L"none") - return LearningRateSearchAlgorithm::None; - else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before") - return LearningRateSearchAlgorithm::SearchBeforeEpoch; - else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after") - return LearningRateSearchAlgorithm::AdjustAfterEpoch; - else - throw std::invalid_argument( - "autoAdjustLR: Invalid learning rate search type. Valid values are " - "(None | SearchBeforeEpoch | AdjustAfterEpoch)"); - } - - GradientsUpdateType GradUpdateType() const {return m_gradType.mType;} - ElemType GradientUpdateNoiseStd() const {return m_gradType.mGaussianNoiseInjectStd;} - ElemType MomentumPerMB() const {return m_momentumPerMB;} - - public: - #define EPSILON 1e-5 - - bool GradientCheck( - ComputationNetwork& net, - const std::vector& criterionNodes, - const std::list& learnableNodes, - int npos) - { - // gradient checking - for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) - { - ComputationNodePtr node = (*nodeIter); - - int irow = (int)fmod(rand(), node->FunctionValues().GetNumRows()-1); - int icol = (int)fmod(rand(), node->FunctionValues().GetNumCols()-1); - irow = max(0, irow); - icol = max(0, icol); - - fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str()); - // node->FunctionValues().Print(); - ElemType eOrg = node->FunctionValues()(irow,icol); - - node->UpdateEvalTimeStamp(); - net.ComputeGradient(criterionNodes[npos]); //use only the first criterion. Is - //ElemType mbEvalCri = - criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - ElemType eGradErr = node->GradientValues()(irow, icol); - - ElemType ePos = eOrg + ElemType(EPSILON); - ElemType eNeg = eOrg - ElemType(EPSILON); - - node->FunctionValues()(irow, icol) = ePos; - node->UpdateEvalTimeStamp(); - net.Evaluate(criterionNodes[npos]); - ElemType mbEvalCriPos = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - - node->FunctionValues()(irow, icol) = eNeg; - node->UpdateEvalTimeStamp(); - net.Evaluate(criterionNodes[npos]); - ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - - // back to its orginal parameter value - node->FunctionValues()(irow, icol) = eOrg; - - // check if they are consistent - ElemType eGradNum = (ElemType)((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg)); - ElemType threshold = (ElemType)pow((ElemType)10.0, max((ElemType)0.0, ceil(log10(min(fabs(eGradErr), fabs(eGradNum))))) - (int)m_gradientCheckSigDigit); - ElemType diff = (ElemType)fabs(eGradErr - eGradNum); - bool wrong = (std::isnan(diff) || diff > threshold); - if (wrong) - { - fprintf (stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n", node->NodeName().c_str(), eGradNum, eGradErr); - return false; - } - } - - return true; - } - - void SetOtherInfo(ComputationNetwork& net , IDataReader* /*trainSetDataReader*/, IDataReader* /*validSetDataReader*/, std::map*>& inputMatrices) - { - std::vector criterionNodes = net.FinalCriterionNodes(); - std::vector evaluationNodes = net.EvaluationNodes(); - - //initializing weights and gradient holder - for (size_t i = 0; i < criterionNodes.size(); i++) - { - if (criterionNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax") - { - 
ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) criterionNodes[i]; - crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); - } - } - - for (size_t i=0;iOperationName() == L"ClassBasedCrossEntropyWithSoftmax") - { - ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evaluationNodes[i]; - crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); - } - } - } - - protected: - - floatargvector m_learningRatesPerSample; /// learning rate per sample provided outside - intargvector m_mbSize; - size_t m_epochSize; - size_t m_maxEpochs; - floatargvector m_momentumInputPerMB; - ElemType m_momentumPerMB; - bool m_gradientClippingWithTruncation; - ElemType m_clippingThresholdPerSample; - - wstring m_modelPath; - wstring m_trainCriterionNodeName; - wstring m_evalCriterionNodeName; - - intargvector m_numMiniBatch4LRSearch; - size_t m_numBestSearchEpoch; - - LearningRateSearchAlgorithm m_autoLearnRateSearchType; - - AdaptationRegType m_adaptationRegType; - ElemType m_adaptationRegWeight; - bool m_needRegularization; - - bool m_loadBestModel; - ElemType m_reduceLearnRateIfImproveLessThan; - bool m_continueReduce; - size_t m_learnRateAdjustInterval; //determine after how many epochs the learning rate should be auto adjusted. - ElemType m_increaseLearnRateIfImproveMoreThan; - ElemType m_learnRateIncreaseFactor; - ElemType m_learnRateDecreaseFactor; - - floatargvector m_dropoutRates; - size_t m_maxTempMemSizeInSamplesForCNN; - - int m_traceLevel; - - size_t m_numPrevLearnRates; - - ElemType m_minLearnRate; - - GradientUpdateInfo m_gradType; - RMSPropInfo m_rpi; - - bool m_usePtask; - - bool m_keepCheckPointFiles; - - int m_numMBsToShowResult; - - bool m_doGradientCheck; - ElemType m_gradientCheckSigDigit; - - bool m_validateAfterModelReloading; - }; - template class SGD; - template class SGD; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +#pragma once + +#include "basetypes.h" +#include "ComputationNetwork.h" +#include "ComputationNetworkHelper.h" +#include "SimpleEvaluator.h" +#include "DataReader.h" +#include +#include +#include +#include "fileutil.h" +#include "commandArgUtil.h" +#include +#include +#include "TimerUtility.h" + +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +extern int myRank; +extern int numProcs; + +using namespace std; + +namespace Microsoft { namespace MSR { namespace CNTK { + + template + void DecimateMinibatch(std::map*> &mb) + { + size_t rv = 0; + if ( numProcs > 1 ) for (auto it = mb.begin(); it != mb.end(); ++it) + { + MSR::CNTK::Matrix &mat = *(it->second); + size_t nCols = mat.GetNumCols(); + size_t col_start = (nCols * myRank) / numProcs; + size_t col_end = (nCols*(myRank + 1)) / numProcs; + if (col_end > nCols) col_end = nCols; // this shouldn't happen + if (col_end == col_start) + { + MSR::CNTK::Matrix tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE); + mat.SetValue(tmp); + } + else + { + MSR::CNTK::Matrix tmp = mat.ColumnSlice(col_start, col_end - col_start); + mat.SetValue(tmp); + } + if (0 == rv) + { + rv = mat.GetNumCols(); + } + else + { + if (rv != mat.GetNumCols()) + throw std::logic_error("Uneven number of columns among inputs."); + } + } + } + + enum class LearningRateSearchAlgorithm : int + { + None, + AdjustAfterEpoch, + SearchBeforeEpoch + }; + + enum class AdaptationRegType : int + { + None, + KL + }; + + enum class GradientsUpdateType : int + { + None, + AdaGrad, + RmsProp + }; + + // configuration parameters associated with RMSProp learning algorithm + typedef struct stRMSPropInfo{ + double gamma; + double inc; + double dec; + double max; + double min; + stRMSPropInfo() + { + gamma = 0.99; + inc = 1.2; + dec = 0.75; + max = 10.0; + min = 0.1; + } + }RMSPropInfo; + + typedef struct stGradientUpdateInfo{ + GradientsUpdateType mType; + float mGaussianNoiseInjectStd; + stGradientUpdateInfo() + { + mType = GradientsUpdateType::AdaGrad; + mGaussianNoiseInjectStd = 0.0075f; + } + } GradientUpdateInfo; + + template + class SGD : ComputationNetworkHelper + { + protected: + typedef ComputationNetworkHelper B; + using B::SetMaxTempMemSizeForCNN; using B::SetDropoutRate; using B::UpdateEvalTimeStamps; + typedef ComputationNode* ComputationNodePtr; + typedef ClassBasedCrossEntropyWithSoftmaxNode* ClassBasedCrossEntropyWithSoftmaxNodePtr; + + public: + SGD(const ConfigParameters& configSGD) + { + ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", ""); + m_needToNormalizeLRByParallUtterance = false; + floatargvector learningRatesPerMB = learningRatesPerMBStr; + + ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", ""); + floatargvector learningRatesPerSample = learningRatesPerSampleStr; + + std::string executionEngineValue = configSGD("executionEngine", "synchronous"); + +#ifdef USE_PTASK + // use PTask if we have more than one GPU or the MultiGPU flag is set + bool usePtask = (g_bestGpu != NULL && g_bestGpu->UseMultiple()) || (bool)configSGD("MultiGPU", "false"); +#else + bool usePtask = false; +#endif + // AutoAdjust Parameters + ConfigParameters configAALR (configSGD("AutoAdjust","")); + LearningRateSearchAlgorithm autoAdjustLRType = ParseLearningRateSearchType(configAALR("autoAdjustLR", "None")); + ElemType reduceLearnRateIfImproveLessThan = configAALR("reduceLearnRateIfImproveLessThan", "0"); + bool continueReduce = (bool)configAALR("continueReduce", "false"); + size_t learnRateAdjustInterval = 
(size_t)configAALR("learnRateAdjustInterval", "1"); + ElemType learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618"); + ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");// std::numeric_limits::infinity()); + ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382"); + ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500"); + intargvector numMiniBatch4LRSearch = minibatch4LRSearch; + size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5"); + size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1"); + bool loadBestModel = configAALR("loadBestModel", "true"); + + ConfigArray minibatchSize = configSGD("minibatchSize", "256"); + intargvector mbSize = minibatchSize; + size_t epochSize = configSGD("epochSize", "0"); + + size_t maxEpochs = configSGD("maxEpochs"); + ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); + floatargvector momentumPerMB = momentumPerMBStr; + + wstring modelPath = configSGD("modelPath"); + wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", ""); + wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", ""); + + size_t maxTempMemSizeInSamplesForCNN = configSGD("maxTempMemSizeInSamplesForCNN", "0"); + + int traceLevel = configSGD("traceLevel", "0"); + size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10"); + + bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false"); + + bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true"); + ElemType clippingThresholdPerSample = configSGD("clippingThresholdPerSample", "1#INF"); // std::numeric_limits::infinity()); + + ConfigArray dropoutRatesStr = configSGD("dropoutRate", "0.0"); + floatargvector dropoutRates = dropoutRatesStr; + + GradientUpdateInfo gUpdateInfo; + GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD("gradUpdateType", "None")); + ElemType gaussianNoiseInjecStd = configSGD("gaussianNoiseInjectStd", "0"); + gUpdateInfo.mType = gradUpdateType; + gUpdateInfo.mGaussianNoiseInjectStd = (float)gaussianNoiseInjecStd; + + // extract RMSProp parameters from config, if they exist. Default to reasonable values. + RMSPropInfo rpi; + rpi.dec = (double)configSGD("rms_wgt_dec", "0.75"); + rpi.inc = (double)configSGD("rms_wgt_inc", "1.2"); + rpi.min = (double)configSGD("rms_wgt_min", "0.1"); + rpi.max = (double)configSGD("rms_wgt_max", "10.0"); + rpi.gamma = (double)configSGD("rms_gamma", "0.99"); + + /// for backward support. 
future setup should use gradUpdateType=AdaGrad, instead of + /// useAdagrad=true + bool useAdagrad = configSGD("useAdagrad", "false"); + if (useAdagrad) + { + gradUpdateType = GradientsUpdateType::AdaGrad; + gUpdateInfo.mType = gradUpdateType; + } + + AdaptationRegType adaptationRegType = ParseAdaptationRegType(configSGD("adaptationRegType", "None")); + ElemType adaptationRegWeight = configSGD("adaptationRegWeight", "0"); + + /// gradient check setup + bool doGradientCheck = configSGD("gradientcheck", "false"); + ElemType gradientCheckSigDigit = configSGD("sigFigs", "6"); + + bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true"); + + bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true"); + + Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation, + clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor, + reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates, + loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult, + maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight, + trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, + rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode); + } + + void setMomentum(float momentum) + { + m_momentumPerMB = (ElemType)momentum; + } + + //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample + void Init(const floatargvector& learningRatesPerMB, const floatargvector& learningRatesPerSample, const intargvector& mbSize, + const size_t epochSize, const size_t maxEpochs, + const wstring& modelPath, const floatargvector& momentumPerMB, const bool gradientClippingWithTruncation = true, + const ElemType clippingThresholdPerSample=std::numeric_limits::infinity(), + const LearningRateSearchAlgorithm autoLearnRateSearchType = LearningRateSearchAlgorithm::None, + const ElemType increaseLearnRateIfImproveMoreThan = std::numeric_limits::infinity(), const ElemType learnRateIncreaseFactor = 1.382f, + const ElemType reduceLearnRateIfImproveLessThan=0, const bool continueReduce=false, const ElemType learnRateDecreaseFactor = 0.618f, floatargvector dropoutRates = floatargvector(L"0.0f"), + const bool loadBestModel=true, const intargvector& numMiniBatch4LRSearch=intargvector(L"500"), const size_t numPrevLearnRates = 5, + const size_t numBestSearchEpoch = 1, const int traceLevel = 0, + const size_t numMBsToShowResult = 10, const size_t maxTempMemSizeInSamplesForCNN = 0, + const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None, + const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"", + const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true, + RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed=true) + { + numPrevLearnRates; + m_mbSize=mbSize; + m_epochSize=epochSize; + if (m_epochSize == 0) + { + m_epochSize = requestDataSize; + } + 
m_maxEpochs=maxEpochs; + + m_gradientClippingWithTruncation=gradientClippingWithTruncation; + m_modelPath=modelPath; + m_autoLearnRateSearchType=autoLearnRateSearchType; + m_traceLevel=traceLevel; + m_loadBestModel=loadBestModel; + m_increaseLearnRateIfImproveMoreThan=increaseLearnRateIfImproveMoreThan; + m_learnRateIncreaseFactor=learnRateIncreaseFactor; + m_reduceLearnRateIfImproveLessThan=reduceLearnRateIfImproveLessThan; + m_continueReduce=continueReduce; + m_learnRateAdjustInterval = max(1, learnRateAdjustInterval); //minimum interval is 1 epoch + m_learnRateDecreaseFactor=learnRateDecreaseFactor; + m_clippingThresholdPerSample=abs(clippingThresholdPerSample); + m_numMiniBatch4LRSearch=numMiniBatch4LRSearch; + m_dropoutRates=dropoutRates; + m_numMBsToShowResult=int(numMBsToShowResult); + m_numBestSearchEpoch=numBestSearchEpoch; + m_maxTempMemSizeInSamplesForCNN=maxTempMemSizeInSamplesForCNN; + m_gradType = gradUpdateType; + m_rpi = rpi; + m_usePtask = usePtask; + m_keepCheckPointFiles = keepCheckPointFiles; + + m_adaptationRegType = adaptationRegType; + m_adaptationRegWeight = adaptationRegWeight; + + m_trainCriterionNodeName = trainCriterionNodeName; + m_evalCriterionNodeName = evalCriterionNodeName; + m_useAllDataForPreComputedNode = UsingAllDataForPreComputed; + + for (size_t i=0; i 0 && learningRatesPerMB.size() > 0) + { + throw std::invalid_argument ("You specified both learningRatesPerSample and learningRatesPerMB. Please comment out one of them."); + } + else if (learningRatesPerSample.size() > 0) + { + m_learningRatesPerSample=learningRatesPerSample; + } + else if (learningRatesPerMB.size() > 0) + { + int LRSize = (int)max(learningRatesPerMB.size(), m_mbSize.size()); + m_learningRatesPerSample.resize(LRSize); + for (int i=0; i0) + { + m_momentumInputPerMB=momentumPerMB; + if (m_momentumInputPerMB[0]>=1 || m_momentumInputPerMB[0]<0) + throw std::invalid_argument ("momentumPerMB must be in [0, 1)."); + } + + if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor<1) + { + throw std::invalid_argument ("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1."); + } + + for (size_t i=0; i= 1 || m_dropoutRates[i] < 0) + { + throw std::invalid_argument ("dropoutRate must be >= 0 and < 1."); + } + } + + if (m_adaptationRegWeight > 1 || m_adaptationRegWeight <0) + throw invalid_argument("adaptationRegWeight must be in [0 1]"); + + m_minLearnRate = 1e-9f; + + m_needRegularization = false; + + m_doGradientCheck = doGradientCheck; + m_gradientCheckSigDigit = gradientCheckSigDigit; + m_validateAfterModelReloading = validateAfterModelReloading; + + msra::files::make_intermediate_dirs (m_modelPath); + } + + void Adapt(wstring origModelFileName, wstring refNodeName, IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, const DEVICEID_TYPE deviceID, const bool makeMode = true) + { + if (origModelFileName == L"" || trainSetDataReader == nullptr) + throw std::invalid_argument ("origModel and trainSetDataReader should not be null."); + + int startEpoch = DetermineStartEpoch(makeMode); + if (startEpoch == m_maxEpochs) + { + fprintf(stderr, "Final model exists. No further training is necessary.\n"); + return; + } + + ComputationNetwork net(deviceID); + if (startEpoch >= 0) + { + wstring modelFileName = GetModelNameForEpoch(int(startEpoch)-1); + fprintf(stderr, "Starting from checkpoint. 
Load Network From File %ls.\n", modelFileName.c_str());
+                net.LoadFromFile(modelFileName);
+            }
+            else
+            {
+                fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
+                net.LoadFromFile(origModelFileName);
+            }
+
+            startEpoch = max(startEpoch, 0);
+
+            ComputationNetwork<ElemType> refNet(deviceID);
+            m_needRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
+            if (m_needRegularization)
+            {
+                fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
+                refNet.LoadFromFile(origModelFileName);
+            }
+
+            ComputationNodePtr refNode = nullptr;
+            if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL)
+            {
+                fprintf(stderr, "Checking refNodeName %ls.\n", refNodeName.c_str());
+                if (refNodeName == L"")
+                    throw invalid_argument("refNodeName does not exist and is needed when adaptationRegType is KL.");
+
+                refNode = refNet.GetNodeFromName(refNodeName);
+            }
+
+            TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader);
+        }
+
+        void Train(IComputationNetBuilder<ElemType>* netBuilder, IDataReader<ElemType>* trainSetDataReader, IDataReader<ElemType>* validationSetDataReader, const bool makeMode = true)
+        {
+            if (netBuilder == nullptr || trainSetDataReader == nullptr)
+                throw std::invalid_argument("netBuilder and trainSetDataReader should not be null.\n");
+
+            int startEpoch = DetermineStartEpoch(makeMode);
+            if (startEpoch == m_maxEpochs)
+            {
+                fprintf(stderr, "Final model exists. No further training is necessary.\n");
+                return;
+            }
+
+            wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+            if (startEpoch >= 0)
+                fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+            ComputationNetwork<ElemType>& net =
+                startEpoch < 0 ? netBuilder->BuildNetworkFromDescription() : netBuilder->LoadNetworkFromFile(modelFileName);
+            // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model
+            // strategy should be to run the initializer above on myRank==0, and then broadcast parameters.
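+            // A sketch of one way to do that broadcast (illustrative only, not part of
+            // this change; assumes MPI_SUPPORT, and a learnableNodes list obtained as in
+            // TrainOrAdaptModel; reuses the CopyToArray/SetValue pattern from the
+            // model-averaging code further below):
+            //
+            //     for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+            //     {
+            //         Matrix<ElemType>& mat = (*nodeIter)->FunctionValues();
+            //         ElemType* px = mat.CopyToArray();    // flatten rank-local values to a CPU buffer
+            //         MPI_Bcast(px, (int)mat.GetNumElements(),
+            //                   sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
+            //         mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), px);   // all ranks adopt rank 0's values
+            //         delete[] px;
+            //     }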
+ + startEpoch = max(startEpoch, 0); + m_needRegularization = false; + + TrainOrAdaptModel(startEpoch, net, net, nullptr, trainSetDataReader, validationSetDataReader); + } + + protected: + std::vector GetTrainCriterionNodes(ComputationNetwork& net) + { + fprintf(stderr, "GetTrainCriterionNodes %ls ...\n", m_trainCriterionNodeName.c_str()); + if (!m_trainCriterionNodeName.empty()) + { + std::vector nodes; + ComputationNodePtr node = net.GetNodeFromName(m_trainCriterionNodeName); + net.ValidateNetwork(node); + if (node->FunctionValues().GetNumElements() != 1) + throw invalid_argument("the trainCriterionNodeName specified in the config file is not a valid training criterion node."); + + nodes.push_back(node); + return nodes; + } + else + return net.FinalCriterionNodes(); + } + std::vector GetEvalCriterionNodes(ComputationNetwork& net) + { + fprintf(stderr, "GetEvalCriterionNodes %ls ...\n", m_evalCriterionNodeName.c_str()); + if (!m_evalCriterionNodeName.empty()) + { + std::vector nodes; + ComputationNodePtr node = net.GetNodeFromName(m_evalCriterionNodeName); + net.ValidateNetwork(node); + if (node->FunctionValues().GetNumElements() != 1) + throw invalid_argument("the evalCriterionNodeName specified in the config file is not a valid evaluation criterion node."); + + nodes.push_back(node); + return nodes; + } + else + return net.EvaluationNodes(); + } + + void TrainOrAdaptModel(int startEpoch, ComputationNetwork& net, ComputationNetwork& refNet, ComputationNodePtr refNode, + IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) + { + std::vector & FeatureNodes = net.FeatureNodes(); + std::vector & labelNodes = net.LabelNodes(); + std::vector criterionNodes = GetTrainCriterionNodes(net); + std::vector evaluationNodes = GetEvalCriterionNodes(net); + + std::map*> inputMatrices; + for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); + } + for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); + } + + // special handling of classed based softmax node. Need a better solution to it. + if (criterionNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName() || + evaluationNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName()) + { + size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); + int deviceId = FeatureNodes[0]->FunctionValues().GetDeviceId(); + inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); + inputMatrices[L"classinfo"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); + } + + + //used for KLD regularized adaptation. For all other adaptation techniques use MEL to edit the model and using normal training algorithm + std::vector refFeatureNodes; + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) + { + refFeatureNodes.resize(FeatureNodes.size()); + for (size_t i=0; iNodeName()); //we need to keep this info to handle deletion + refNet.ChangeNode(FeatureNodes[i]->NodeName(), FeatureNodes[i]); + } + + refNet.RebuildNetwork(refNode); + } + + //initializing weights and gradient holder + std::list& learnableNodes = net.LearnableNodes(criterionNodes[0]); //only one criterion so far TODO: support multiple ones? 
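+            // The buffers created next hold the optimizer state: one Matrix<ElemType> per
+            // learnable node, shaped like that node's parameter matrix. Depending on the
+            // update rule chosen in UpdateWeightsS, this is the momentum-smoothed gradient
+            // (plain SGD) or the accumulated per-element statistics (AdaGrad / RmsProp),
+            // and it is exactly what SaveCheckPointInfo/LoadCheckPointInfo serialize.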
+ std::list> smoothedGradients; + + for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) + { + ComputationNodePtr node = (*nodeIter); + smoothedGradients.push_back(Matrix(node->FunctionValues().GetNumRows(), node->FunctionValues().GetNumCols(),net.GetDeviceID())); + } + + ElemType epochCriterion, avgCriterion, prevCriterion; + epochCriterion = avgCriterion = prevCriterion = std::numeric_limits::infinity(); + size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval; + + std::vector epochEvalErrors(evaluationNodes.size(),std::numeric_limits::infinity()); + + std::vector evalNodeNames; + for (size_t i=0;iNodeName()); + + size_t totalSamplesSeen = 0; + ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; + + int m_numPrevLearnRates = 5; //used to control the upper learnining rate in LR search to reduce computation + vector prevLearnRates; + prevLearnRates.resize(m_numPrevLearnRates); + for (int i=0; i::infinity(); + + //precompute mean and invStdDev nodes and save initial model + if (PreCompute(net, trainSetDataReader, FeatureNodes, labelNodes, inputMatrices) || startEpoch == 0) + if (0 == myRank) // only needs to be done by one process + net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1)); + + // first, we need to normalize the effect of nbruttsineachrecurrentiter + if (trainSetDataReader->NumberSlicesInEachRecurrentIter()>1 && m_needToNormalizeLRByParallUtterance) + { + for (auto & x : m_learningRatesPerSample) + { + x /= trainSetDataReader->NumberSlicesInEachRecurrentIter(); + } + } + bool learnRateInitialized = false; + if (startEpoch > 0) + { + learnRateInitialized = LoadCheckPointInfo(startEpoch-1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); + setMomentum(m_momentumInputPerMB[m_momentumInputPerMB.size()-1]); + } + + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch) + throw std::invalid_argument ("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch."); + + unsigned long dropOutSeed = 1; + ElemType prevDropoutRate = 0; + + bool learnRateReduced = false; + + SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN); + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) + SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN); + + // build the PTask graph if they want to use ptask + // NOTE: the graph is currently only for training, so other operations will still use the usual method, + // (i.e rate adjustment and other custom operations still use the non PTask method) + if (m_usePtask) + { + // set the minibatch size to the largest thing we will ever see + int maxMbSize = 0; + for (int val : m_mbSize) + { + maxMbSize = max(val, maxMbSize); + } + net.SetActualMiniBatchSize(maxMbSize); + net.BuildPTaskGraph(); + } + + for (int i = int(startEpoch); i < int(m_maxEpochs); i++) + { + auto t_start_epoch = Timer::MilliSecondElapsed(); + + // set other information to inputMatrices that can contrain information + // used for class-based LM for clustring information + SetOtherInfo(net, trainSetDataReader, validationSetDataReader, inputMatrices); + + //set dropout rate + SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); + + //learning rate adjustment + if 
(m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) + { + learnRatePerSample = m_learningRatesPerSample[i]; + setMomentum(m_momentumInputPerMB[i]); + } + else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) + { + ElemType largestPrevLearnRatePerSample = prevLearnRates[0]; + for (int j = 1; j < m_numPrevLearnRates; j++) + { + largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); + } + + //return a reasonable learning rate based on the initial mbsize + learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, trainSetDataReader, FeatureNodes, + labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample); + + prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; //save per sample learn rate to support changeable mbsize + } + + learnRateInitialized = true; + + if (learnRatePerSample < m_minLearnRate) + { + fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); + if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) + net.SaveToFile(m_modelPath); + break; + } + +#ifdef MPI_SUPPORT + INT32 mySamples = (INT32) +#endif + fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", (int)startEpoch, learnRatePerSample, m_momentumPerMB); + TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, + epochCriterion, epochEvalErrors, totalSamplesSeen); + + auto t_end_epoch = Timer::MilliSecondElapsed(); + ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC); + + fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion); + if (epochEvalErrors.size() == 1) + { + fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); + } + else + { + fprintf(stderr, "EvalErr Per Sample "); + for (size_t j = 0; j < epochEvalErrors.size(); j++) + fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); + fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime); + fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); + for (size_t j = 0; j < epochEvalErrors.size(); j++) + fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]); + } + +#ifdef MPI_SUPPORT + // model reduction and averaging + if (numProcs > 0) + { + ElemType factor; // weight for the parameter of my model + { + // compute total minibatch size + INT32 allSamples = 0; + MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + if (allSamples == 0) allSamples = 1; + + factor = (ElemType)mySamples / (ElemType)allSamples; + } + + for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) + { + ComputationNodePtr node = (*nodeIter); + Microsoft::MSR::CNTK::Matrix &mat = node->FunctionValues(); + + // weight model by relative size of minibatch samples (and number of processors, for 
averaging)
+                        ElemType* px = mat.CopyToArray();
+                        size_t nx = mat.GetNumElements();
+                        transform(px, px + nx, px, [factor](ElemType& val) -> ElemType { return val * factor; });
+
+                        // TODO: Replace default Allreduce with the reduction-shuffle-dance
+                        vector<ElemType> py = vector<ElemType>(nx, ElemType(0));
+                        MPI_Allreduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+                        mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0]));
+                        delete[] px;   // CopyToArray allocates an array with new[], so array delete is required
+                    }
+                }
+#endif
+
+                if (0 == myRank) // only evaluate once, on the master process. TODO: This could be faster by farming out the validation parts
+                    if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
+                    {
+                        SimpleEvaluator<ElemType> evalforvalidation(net);
+                        vector<wstring> cvSetTrainAndEvalNodes;
+                        cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
+                        cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
+
+                        vector<ElemType> vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
+                        fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n",
+                                i + 1, vScore[0], vScore[1]);
+
+                        epochCriterion = vScore[0]; //the first one is the training criterion.
+                    }
+#ifdef MPI_SUPPORT
+                // ensure all processes have the same epochCriterion
+                MPI_Bcast(&epochCriterion, 1, sizeof(epochCriterion) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
+#endif
+
+                bool loadedPrevModel = false;
+                size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
+                if (avgCriterion == std::numeric_limits<ElemType>::infinity())
+                    avgCriterion = epochCriterion;
+                else
+                    avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
+
+                if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+                {
+                    if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                    {
+                        if (m_loadBestModel)
+                        {
+                            net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1), m_validateAfterModelReloading);
+                            net.ResetEvalTimeStamp();
+                            LoadCheckPointInfo(i - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
+                            fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
+                            loadedPrevModel = true;
+                        }
+                    }
+
+                    if (m_continueReduce)
+                    {
+                        if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                        {
+                            if (learnRateReduced == false)
+                            {
+                                learnRateReduced = true;
+                            }
+                            else
+                            {
+                                if (myRank == 0)
+                                    net.SaveToFile(GetModelNameForEpoch(i, true));
+                                fprintf(stderr, "Finished training and saved final model\n\n");
+                                break;
+                            }
+                        }
+                        if (learnRateReduced)
+                        {
+                            learnRatePerSample *= m_learnRateDecreaseFactor;
+                            fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                        }
+                    }
+                    else
+                    {
+                        if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                        {
+                            learnRatePerSample *= m_learnRateDecreaseFactor;
+                            fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                        }
+                        else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                        {
+                            learnRatePerSample *=
m_learnRateIncreaseFactor; + fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample); + } + } + } + + if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) //not loading previous values then set them + { + prevCriterion = avgCriterion; + epochsNotCountedInAvgCriterion = 0; + } + + //persist model and check-point info + if (0 == myRank) + { + net.SaveToFile(GetModelNameForEpoch(i)); + SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); + if (!m_keepCheckPointFiles) + _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); //delete previous checkpiont file to save space + } + + if (learnRatePerSample < 1e-12) + fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample); + } + + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) //since we linked feature nodes. we need to remove it from the deletion + { + for (size_t i=0; iNodeName(), refFeatureNodes[i]); //note we need to handle deletion carefully + } + } + + if (inputMatrices[L"classinfo"]) + { + delete inputMatrices[L"classinfo"]; + inputMatrices.erase(L"classinfo"); + } + if (inputMatrices[L"idx2cls"]) + { + delete inputMatrices[L"idx2cls"]; + inputMatrices.erase(L"idx2cls"); + } + + } + + protected: + + //return true if precomputation is executed. + bool PreCompute(ComputationNetwork& net, + IDataReader* trainSetDataReader, + std::vector& FeatureNodes, + std::vector& labelNodes, + std::map*>& inputMatrices) + { + std::list nodes = net.GetNodesRequirePreComputation(); + + if (nodes.size() == 0) + { + fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step\n"); + return false; + } + + fprintf(stderr, "Found %lu PreCompute nodes\n", nodes.size()); + for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) + { + PreComputedNode* node = static_cast*> (*nodeIter); + fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str()); + } + + //compute + //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize); + // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch + // [1/12/2015 erw] to support large dataset, we usually paritition whole dataset into several epoches, so we need to use all the data to do precomputing + if (m_useAllDataForPreComputedNode) + trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); // using all the data + else + trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // using all the data + + while (trainSetDataReader->GetMinibatch(inputMatrices)) + { + UpdateEvalTimeStamps(FeatureNodes); + UpdateEvalTimeStamps(labelNodes); + + size_t actualMBSize = net.GetActualMBSize(); + net.SetActualMiniBatchSize(actualMBSize); + net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd); + + for (auto nodeIter=nodes.begin(); nodeIter != nodes.end(); nodeIter++) + { + net.Evaluate( *nodeIter); + } + } + + //mark done + for (auto nodeIter=nodes.begin(); nodeIter != nodes.end(); nodeIter++) + { + PreComputedNode* node = static_cast*> (*nodeIter); + node->MarkComputed(true); + } + + return true; + } + + //return a reasonable initial learning rate based on the initial mbsize + ElemType SearchLearnRateBeforeEpoch(ComputationNetwork& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, + const int epochNumber, const ElemType 
curLearnRate, + IDataReader* trainSetDataReader, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, const bool /*learnRateInitialized*/, const ElemType largestPrevLearnRatePerSample) + { + ElemType epochCriterion = std::numeric_limits::infinity(), prevCriterion = std::numeric_limits::infinity(); + vector epochEvalErrors(evaluationNodes.size(),std::numeric_limits::infinity()); + //ElemType epochEvalError = std::numeric_limits::infinity(); + size_t totalSamplesSeen = 0; + ElemType bestLearnRatePerSample = curLearnRate; + + size_t epochSize = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; + if (m_epochSize != requestDataSize) + { + epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision + } + + ElemType baseCriterion; + + ElemType minLearnRate = m_minLearnRate * 0.3f; + ElemType learnRatePerSample = 1.0f / 8.0f / 0.618f /sqrt((ElemType)m_mbSize[epochNumber]); + + if (largestPrevLearnRatePerSample != std::numeric_limits::infinity()) + learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f; //largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety + + int baseModelEpoch = epochNumber-1; + net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading); + net.ResetEvalTimeStamp(); + + ElemType learnRate =learnRatePerSample; + LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, smoothedGradients, prevCriterion); + + //if model is not changed this is what we will get + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, 0, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, baseCriterion, epochEvalErrors, totalSamplesSeen); + + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) + { + if (prevCriterion == std::numeric_limits::infinity()) + prevCriterion = baseCriterion; + ElemType ratio = 0.3f; + if (m_epochSize != requestDataSize) + { + ratio = pow(((ElemType)epochSize) / m_epochSize, 1.0f/2); + } + baseCriterion = max(ratio * prevCriterion + (1-ratio) * baseCriterion, baseCriterion); + } + + do + { + learnRatePerSample *= 0.618f; + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); + + } while (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate); + + bestLearnRatePerSample = learnRatePerSample; + + if (epochNumber < m_numBestSearchEpoch) //grid search for the first m_numBestSearchEpoch epochs + { + ElemType leftLearnRatePerSample = 0.01f / m_mbSize[epochNumber], rightLearnRatePerSample = learnRatePerSample; + ElemType leftCriterion, rightCriterion = epochCriterion; + + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); + + while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) + { + if (rightCriterion > leftCriterion) + { + 
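+                        // 0.618 is roughly 1/golden-ratio: whichever endpoint of the bracket
+                        // [leftLearnRatePerSample, rightLearnRatePerSample] currently has the
+                        // worse criterion is pulled inward by a golden-section-style step,
+                        // and the enclosing while loop stops once the endpoints are within a
+                        // factor of 1.2 of each other.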
rightLearnRatePerSample *= 0.618f; + + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, rightLearnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, rightCriterion, epochEvalErrors, totalSamplesSeen); + } + else + { + leftLearnRatePerSample /= 0.618f; + + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); + } + } + + bestLearnRatePerSample = (leftCriterion < rightCriterion)? leftLearnRatePerSample : rightLearnRatePerSample; + } + + fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n", epochNumber+1, bestLearnRatePerSample, baseCriterion); + + return bestLearnRatePerSample; + } + + void TrainOneMiniEpochAndReloadModel(ComputationNetwork& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, + const int epochNumber,const size_t epochSize, IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) + { + TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample,FeatureNodes,labelNodes, + criterionNodes,evaluationNodes,inputMatrices, learnableNodes,smoothedGradients, + epochCriterion, epochEvalErrors, totalSamplesSeen); + fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ", epochCriterion); + if (epochEvalErrors.size()==1) + fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n", epochEvalErrors[0], learnRatePerSample); + else + { + fprintf(stderr, "EvalErr Per Sample "); + for (size_t i=0; i& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, + const int epochNumber, const size_t epochSize, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) + { + ElemType readTimeInMBs = 0, ComputeTimeInMBs = 0, epochCriterionLastMBs = 0; + int numSamplesLastMBs = 0; + std::vector epochEvalErrorsLastMBs(epochEvalErrors.size(),0); + PTaskGraphBuilder* ptaskGraphBuilder = NULL; + + unsigned long long startReadMBTime = 0, startComputeMBTime=0; + unsigned long long endReadMBTime = 0, endComputeMBTime = 0; + + //initialize statistics + size_t totalEpochSamples = 0; + + int numMBsRun = 0; + bool beginEpoch = true; + + size_t numEvalNodes = epochEvalErrors.size(); + + // NOTE: the following two local matrices are not used in PTask path + Matrix localEpochCriterion(1,1,net.GetDeviceID()); //assume only one training criterion node for each epoch + Matrix localEpochEvalErrors(1,numEvalNodes,net.GetDeviceID()); + + localEpochCriterion.SetValue(0); + localEpochEvalErrors.SetValue(0); + + if (m_usePtask) + { + epochCriterion = 
ElemType(0.0); + epochEvalErrors.assign(numEvalNodes, ElemType(0.0)); + } + + trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); + + // build the PTask graph if they want to use ptask + // NOTE: the graph is currently only for training, so other operations will still use the usual method, + // (i.e rate adjustment, regularization and other custom operations still use the non PTask method) + if (m_usePtask) + { + ptaskGraphBuilder = net.GetPTaskGraphBuilder(); + ptaskGraphBuilder->UpdateParameters(this, learnRatePerSample, m_mbSize[epochNumber]); + ptaskGraphBuilder->StartPTaskGraph(); + + // currently CNTK likes to keep things on the GPU, and PTask expects things to be on the CPU, so tell CNTK to keep data on the CPU + for (std::pair*> inpair : inputMatrices) + { + Matrix* mat = inpair.second; + mat->SetPreferredDeviceId(CPUDEVICE); + mat->TransferFromDeviceToDevice(mat->GetDeviceId(), CPUDEVICE, true); + } + } + + startReadMBTime=Timer::MilliSecondElapsed(); + while (trainSetDataReader->GetMinibatch(inputMatrices)) + { +#ifdef MPI_SUPPORT + DecimateMinibatch(inputMatrices); +#endif + endReadMBTime=Timer::MilliSecondElapsed(); + startComputeMBTime=Timer::MilliSecondElapsed(); + + UpdateEvalTimeStamps(FeatureNodes); + UpdateEvalTimeStamps(labelNodes); + + size_t actualMBSize = net.GetActualMBSize(); + if (0 == actualMBSize) + continue; + + net.SetActualMiniBatchSize(actualMBSize); + net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd); + +#ifndef EVALDLL + if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false) + { + throw std::logic_error("cannot pass gradient checker"); + } +#endif + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) //TODO: currently only support one node regularization + { + refNet.SetActualMiniBatchSize(actualMBSize); + refNet.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + refNet.Evaluate(refNode); + Matrix::ScaleAndAdd(m_adaptationRegWeight, refNode->FunctionValues(), 1-m_adaptationRegWeight, labelNodes[0]->FunctionValues()); + } + + if (m_usePtask) + { + // Pushing data in the graph starts things going + bool endOfEpoch = trainSetDataReader->DataEnd(endDataEpoch); + CONTROLSIGNAL signal = beginEpoch?DBCTLC_BOF:DBCTLC_NONE; + if (endOfEpoch) + signal |= DBCTLC_EOF; + + ptaskGraphBuilder->PushData(inputMatrices, signal); + ptaskGraphBuilder->PushActualMBSize(learnableNodes, net.GetActualMBSize(), signal); + beginEpoch = false; // clear this out after first epoch + + // pull the values from the graph for the totals + epochCriterion += ptaskGraphBuilder->GetValue(criterionNodes[0]); + for (size_t i=0; iGetValue(evaluationNodes[i]); + } + + // NOTE: update model parameters is part of the graph, so nothing to do here + } + else + { + if (learnRatePerSample > m_minLearnRate * 0.01) //only compute gradient when learning rate is large enough + net.ComputeGradient(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? + else + net.Evaluate(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? 
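+                    // The minibatch criterion is a 1x1 matrix on the compute device; the
+                    // AddElementToElement calls below fold it (and each eval criterion) into
+                    // the on-device running totals, so the per-epoch sums only cross back to
+                    // the CPU when Get00Element() is called for logging.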
+ + Matrix::AddElementToElement(criterionNodes[0]->FunctionValues(), 0, 0, localEpochCriterion, 0, 0); + + std::vectormbEvalErrors(numEvalNodes,0); + for (size_t i=0; i::AddElementToElement(evaluationNodes[i]->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i); + } + + //update model parameters + if (learnRatePerSample > m_minLearnRate * 0.01) + { + auto smoothedGradientIter=smoothedGradients.begin(); + for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++) + { + ComputationNodePtr node = (*nodeIter); + Matrix& smoothedGradient = (*smoothedGradientIter); + + UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber]); + } + } + } + + + endComputeMBTime=Timer::MilliSecondElapsed(); + numMBsRun ++; + if (m_traceLevel > 0) + { + ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(MS_PER_SEC); + ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/MS_PER_SEC; + + readTimeInMBs += MBReadTime; + ComputeTimeInMBs += MBComputeTime; + numSamplesLastMBs += int(actualMBSize); + + if (numMBsRun % m_numMBsToShowResult == 0) + { + if (!m_usePtask) + { // get the epoch Values updated, in PTask don't use the loclEpoch* temporary matrices + epochCriterion = localEpochCriterion.Get00Element(); + for (size_t i=0; i< numEvalNodes; i++) + epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); + } + + fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ",epochNumber+1, numMBsRun-m_numMBsToShowResult+1, numMBsRun, numSamplesLastMBs, + (epochCriterion-epochCriterionLastMBs)/numSamplesLastMBs); + for (size_t i=0; i= epochSize) + break; + + /// call DataEnd function + /// DataEnd does reader specific process if sentence ending is reached + trainSetDataReader->DataEnd(endDataSentence); + + } + + if (m_usePtask) + { + // when the epoch is complete, we need to transfer all the values back to the LearnableNodes, which will be saved off as the model + std::list learnableNodes = net.LearnableNodes(criterionNodes[0]); + for (ComputationNodePtr node : learnableNodes) + { + ptaskGraphBuilder->GetValue(node, node->FunctionValues()); + } + epochCriterion /= float(totalEpochSamples); + for (size_t i=0; i< numEvalNodes; i++) + { + epochEvalErrors[i] /= float(totalEpochSamples); + } + } + else + { + localEpochCriterion /= float(totalEpochSamples); + localEpochEvalErrors /= float(totalEpochSamples); + + epochCriterion = localEpochCriterion.Get00Element(); + for (size_t i=0; i< numEvalNodes; i++) + { + epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); + } + } + return totalEpochSamples; + } +public: + // UpdateWeightsS - static version of UpdateWeights() + static void UpdateWeightsS(const SGD* sgd, Matrix& functionValues, Matrix& gradientValues, Matrix& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize) + { +#if DUMPOUTPUT + fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",learnRatePerSample, actualMBSize, expectedMBSize); + fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f, sgd->MomentumPerMB()=%0.8f\n",sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd(), sgd->MomentumPerMB()); + gradientValues.Print("Gradient Input"); + smoothedGradient.Print("Smoothed Gradient Input"); +#endif + + // make actualMBSize is a valid value + assert(actualMBSize > 0); + + //clipping gradients to prevent outliers + sgd->ClipGradient(gradientValues, 
actualMBSize);
+
+            GradientsUpdateType adpType = sgd->GradUpdateType();
+            ElemType noiseStd = sgd->GradientUpdateNoiseStd();
+            Matrix<ElemType> sgdUpdateNoise((DEVICEID_TYPE)functionValues.GetDeviceId());
+            if (noiseStd > 0)
+            {
+                sgdUpdateNoise.SetValue(gradientValues);  /// get the gradient structure since gradient is sparse
+                sgdUpdateNoise.SetGaussianRandomValue(0, noiseStd);  // reset its value to random
+            }
+
+            if (adpType == GradientsUpdateType::None)
+            {
+                ElemType momentum = sgd->MomentumPerMB();
+                if (actualMBSize < expectedMBSize && momentum > 0.0000001f)  //scale momentum for the smaller minibatch: momentum^(actualMBSize/expectedMBSize)
+                {
+                    momentum = (ElemType) exp (log(momentum)/expectedMBSize * actualMBSize);
+                }
+                smoothedGradient.NormalGrad(gradientValues, functionValues, learnRatePerSample, momentum);
+            }
+            if (adpType == GradientsUpdateType::AdaGrad)
+            {
+                smoothedGradient.Adagrad(gradientValues);
+                Matrix<ElemType>::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues);
+            }
+            if (adpType == GradientsUpdateType::RmsProp)
+            {
+                // include L2 regularizer
+                Matrix<ElemType>::ScaleAndAdd((ElemType)0.001, functionValues, gradientValues);
+                smoothedGradient.RmsProp(gradientValues, (ElemType)sgd->m_rpi.gamma, (ElemType)sgd->m_rpi.inc, (ElemType)sgd->m_rpi.max, (ElemType)sgd->m_rpi.dec, (ElemType)sgd->m_rpi.min);
+                Matrix<ElemType>::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues);
+            }
+
+            if (noiseStd > 0)
+            {
+                Matrix<ElemType>::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues);
+            }
+#if DUMPOUTPUT
+            functionValues.Print("Parameter Update");
+#endif
+        }
+protected:
+        // UpdateWeights - update the weights of a single node
+        void UpdateWeights(const ComputationNodePtr node, Matrix<ElemType>& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize) const
+        {
+#if DUMPOUTPUT
+            fprintf(stderr, "Update_%ls\n",node->NodeName().c_str());
+#endif
+            UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize);
+            node->UpdateEvalTimeStamp();
+        }
+
+        void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
+        {
+            if (m_clippingThresholdPerSample != std::numeric_limits<ElemType>::infinity())
+            {
+                ElemType maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize;
+                if (m_gradientClippingWithTruncation)
+                {
+                    gradient.InplaceTruncate(maxGradientPerMB);
+                }
+                else //norm2 normalized
+                {
+                    ElemType gradientNorm = gradient.FrobeniusNorm();
+                    if (gradientNorm > maxGradientPerMB)
+                    {
+                        ElemType normFactor = maxGradientPerMB / gradientNorm;
+                        gradient *= normFactor;
+                    }
+                }
+            }
+        }
+
+        void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample,
+                                const std::list<Matrix<ElemType>>& smoothedGradients, const ElemType prevCriterion)
+        {
+            wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
+
+            File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
+
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
+            fstream << totalSamplesSeen << learnRatePerSample << prevCriterion;
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
+
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
+
+            for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+            {
+                const Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+                fstream << smoothedGradient;
+            }
+
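+            // Layout of the checkpoint written here (inferred from the markers above and
+            // the matching reads in LoadCheckPointInfo below):
+            //   BCKP
+            //     BLearnRate  totalSamplesSeen learnRatePerSample prevCriterion  ELearnRate
+            //     BGradient   one serialized Matrix<ElemType> per smoothed gradient  EGradient
+            //   ECKP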
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
+
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
+        }
+
+        bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, ElemType& learnRatePerSample,
+                                std::list<Matrix<ElemType>>& smoothedGradients, ElemType& prevCriterion)
+        {
+            wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
+            if (!fexists(checkPointFileName.c_str()))
+            {
+                fprintf(stderr, "Warning: checkpoint file is missing. Learning parameters will be initialized from scratch.\n");
+                return false;
+            }
+
+            File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
+            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
+
+            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
+            fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion;
+            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
+
+            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
+
+            for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+            {
+                Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+                fstream >> smoothedGradient;
+            }
+            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
+
+            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP");
+
+            return true;
+        }
+
+        wstring GetCheckPointFileNameForEpoch(const int epoch)
+        {
+            return GetModelNameForEpoch(epoch) + L".ckp";
+        }
+
+        wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false)
+        {
+            int epoch1Base = epoch + 1;
+            if (epoch1Base == m_maxEpochs || bLastModel)
+                return m_modelPath;
+            else
+                return msra::strfun::wstrprintf(L"%s.%d", m_modelPath.c_str(), (int) epoch1Base);
+        }
+
+        //return -1 if nothing exists
+        int DetermineStartEpoch(const bool makeMode)
+        {
+            if (!makeMode)
+                return -1;  //always start from scratch
+
+            int firstEpoch = -1;
+
+            wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs)-1);
+            for (int e = int(m_maxEpochs)-1; e >= -1; e--)
+            {
+                const wstring prevEpochFile = GetModelNameForEpoch(e-1);
+
+                if (msra::files::fuptodate(curEpochFile, prevEpochFile, false))
+                {
+                    firstEpoch = int(e)+1;
+                    break;
+                }
+                else
+                    curEpochFile = prevEpochFile;
+            }
+
+            return firstEpoch;
+        }
+
+        AdaptationRegType ParseAdaptationRegType(wstring s)
+        {
+            msra::strfun::tolower_ascii(s);
+            if (s == L"" || s == L"none")
+                return AdaptationRegType::None;
+            else if (s == L"kl" || s == L"klreg")
+                return AdaptationRegType::KL;
+            else
+                throw std::invalid_argument(
+                    "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are "
+                    "(None | KL)");
+        }
+
+        GradientsUpdateType ParseGradUpdateType(wstring s)
+        {
+            msra::strfun::tolower_ascii(s);
+            if (s == L"" || s == L"none")
+                return GradientsUpdateType::None;
+            else if (s == L"adagrad")
+                return GradientsUpdateType::AdaGrad;
+            else if (s == L"rmsprop")
+                return GradientsUpdateType::RmsProp;
+            else
+                throw std::invalid_argument(
+                    "ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are "
+                    "(None | AdaGrad | RmsProp)");
+        }
+
+        LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)
+        {
+            msra::strfun::tolower_ascii(s);
+            if (s == L"false" || s == L"none")
+                return LearningRateSearchAlgorithm::None;
+            else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before")
+                return LearningRateSearchAlgorithm::SearchBeforeEpoch;
+            else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after")
+                return LearningRateSearchAlgorithm::AdjustAfterEpoch;
+            else
+                throw std::invalid_argument(
+                    "autoAdjustLR: Invalid learning rate search type. Valid values are "
+                    "(None | SearchBeforeEpoch | AdjustAfterEpoch)");
+        }
+
+        GradientsUpdateType GradUpdateType() const {return m_gradType.mType;}
+        ElemType GradientUpdateNoiseStd() const {return m_gradType.mGaussianNoiseInjectStd;}
+        ElemType MomentumPerMB() const {return m_momentumPerMB;}
+
+    public:
+        #define EPSILON 1e-5
+
+        bool GradientCheck(
+            ComputationNetwork<ElemType>& net,
+            const std::vector<ComputationNodePtr>& criterionNodes,
+            const std::list<ComputationNodePtr>& learnableNodes,
+            int npos)
+        {
+            // gradient checking
+            for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+            {
+                ComputationNodePtr node = (*nodeIter);
+
+                int irow = (int)fmod(rand(), node->FunctionValues().GetNumRows()-1);
+                int icol = (int)fmod(rand(), node->FunctionValues().GetNumCols()-1);
+                irow = max(0, irow);
+                icol = max(0, icol);
+
+                fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
+                // node->FunctionValues().Print();
+                ElemType eOrg = node->FunctionValues()(irow,icol);
+
+                node->UpdateEvalTimeStamp();
+                net.ComputeGradient(criterionNodes[npos]);  //use only the first criterion. Is there any possibility to use more?
+                //ElemType mbEvalCri =
+                criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
+                ElemType eGradErr = node->GradientValues()(irow, icol);
+
+                ElemType ePos = eOrg + ElemType(EPSILON);
+                ElemType eNeg = eOrg - ElemType(EPSILON);
+
+                node->FunctionValues()(irow, icol) = ePos;
+                node->UpdateEvalTimeStamp();
+                net.Evaluate(criterionNodes[npos]);
+                ElemType mbEvalCriPos = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
+
+                node->FunctionValues()(irow, icol) = eNeg;
+                node->UpdateEvalTimeStamp();
+                net.Evaluate(criterionNodes[npos]);
+                ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
+
+                // back to its original parameter value
+                node->FunctionValues()(irow, icol) = eOrg;
+
+                // check if they are consistent
+                ElemType eGradNum = (ElemType)((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg));
+                ElemType threshold = (ElemType)pow((ElemType)10.0, max((ElemType)0.0, ceil(log10(min(fabs(eGradErr), fabs(eGradNum))))) - (int)m_gradientCheckSigDigit);
+                ElemType diff = (ElemType)fabs(eGradErr - eGradNum);
+                bool wrong = (std::isnan(diff) || diff > threshold);
+                if (wrong)
+                {
+                    fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n", node->NodeName().c_str(), eGradNum, eGradErr);
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        void SetOtherInfo(ComputationNetwork<ElemType>& net, IDataReader<ElemType>* /*trainSetDataReader*/, IDataReader<ElemType>* /*validSetDataReader*/, std::map<std::wstring, Matrix<ElemType>*>& inputMatrices)
+        {
+            std::vector<ComputationNodePtr> criterionNodes = net.FinalCriterionNodes();
+            std::vector<ComputationNodePtr> evaluationNodes = net.EvaluationNodes();
+
+            //initializing weights and gradient holder
+            for (size_t i = 0; i < criterionNodes.size(); i++)
+            {
+                if (criterionNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax")
+                {
+                    ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) criterionNodes[i];
+                    crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]);
+                }
+            }
+
+            for (size_t i=0; i<evaluationNodes.size(); i++)
+            {
+                if (evaluationNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax")
+                {
+                    ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evaluationNodes[i];
+                    crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]);
+                }
+            }
+        }
+
+    protected:
+
+        floatargvector m_learningRatesPerSample;  /// learning rate per sample provided outside
+        bool m_needToNormalizeLRByParallUtterance;  // only true when the user specifies LearningRatePerMB and the number of parallel utterances in the reader > 1
+        intargvector m_mbSize;
+        size_t m_epochSize;
+        size_t m_maxEpochs;
+        floatargvector m_momentumInputPerMB;
+        ElemType m_momentumPerMB;
+        bool m_gradientClippingWithTruncation;
+        ElemType m_clippingThresholdPerSample;
+
+        wstring m_modelPath;
+        wstring m_trainCriterionNodeName;
+        wstring m_evalCriterionNodeName;
+
+        intargvector m_numMiniBatch4LRSearch;
+        size_t m_numBestSearchEpoch;
+
+        LearningRateSearchAlgorithm m_autoLearnRateSearchType;
+
+        AdaptationRegType m_adaptationRegType;
+        ElemType m_adaptationRegWeight;
+        bool m_needRegularization;
+
+        bool m_loadBestModel;
+        ElemType m_reduceLearnRateIfImproveLessThan;
+        bool m_continueReduce;
+        size_t m_learnRateAdjustInterval;  //determines after how many epochs the learning rate should be auto adjusted
+        ElemType m_increaseLearnRateIfImproveMoreThan;
+        ElemType m_learnRateIncreaseFactor;
+        ElemType m_learnRateDecreaseFactor;
+
+        floatargvector m_dropoutRates;
+        size_t m_maxTempMemSizeInSamplesForCNN;
+
+        int m_traceLevel;
+
+        size_t m_numPrevLearnRates;
+
+        ElemType m_minLearnRate;
+
+        GradientUpdateInfo m_gradType;
+        RMSPropInfo m_rpi;
+
+        bool m_usePtask;
+
+        bool m_keepCheckPointFiles;
+
+        int m_numMBsToShowResult;
+
+        bool m_doGradientCheck;
+        ElemType m_gradientCheckSigDigit;
+
+        bool m_validateAfterModelReloading;
+
+        bool m_useAllDataForPreComputedNode;
+    };
+    template class SGD<float>;
+    template class SGD<double>;
+
+}}}
diff --git a/MachineLearning/cn/SimpleEvaluator.h b/MachineLearning/cn/SimpleEvaluator.h
index 6c53a09b9..977cefde6 100644
--- a/MachineLearning/cn/SimpleEvaluator.h
+++ b/MachineLearning/cn/SimpleEvaluator.h
@@ -1,350 +1,349 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// -#pragma once - -#include "ComputationNetwork.h" -#include "ComputationNetworkHelper.h" -#include "DataReader.h" -#include -#include -#include -#include "basetypes.h" -#include "fileutil.h" -#include "commandArgUtil.h" -#include - -using namespace std; - -namespace Microsoft { namespace MSR { namespace CNTK { - - template - class SimpleEvaluator : ComputationNetworkHelper - { - typedef ComputationNetworkHelper B; - using B::UpdateEvalTimeStamps; - protected: - typedef ComputationNode* ComputationNodePtr; - typedef ClassBasedCrossEntropyWithSoftmaxNode* ClassBasedCrossEntropyWithSoftmaxNodePtr; - - public: - - SimpleEvaluator(ComputationNetwork& net, const size_t numMBsToShowResult=100, const int traceLevel=0) - : m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel) - { - } - - //returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes) - vector Evaluate(IDataReader& dataReader, const vector& evalNodeNames, const size_t mbSize, const size_t testSize=requestDataSize) - { - //specify evaluation nodes - std::vector evalNodes; - - if (evalNodeNames.size() == 0) - { - fprintf (stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); - if (m_net.EvaluationNodes().size() == 0 && m_net.FinalCriterionNodes().size() == 0) - throw std::logic_error("There is no default evalnodes or training criterion node specified in the network."); - - for (int i=0; i< m_net.EvaluationNodes().size(); i++) - evalNodes.push_back(m_net.EvaluationNodes()[i]); - - for (int i=0; i< m_net.FinalCriterionNodes().size(); i++) - evalNodes.push_back(m_net.FinalCriterionNodes()[i]); - } - else - { - for (int i=0; iFunctionValues().GetNumElements() == 1) - { - throw std::logic_error("The nodes passed to SimpleEvaluator::Evaluate function must be either eval or training criterion nodes (which evalues to 1x1 value)."); - } - evalNodes.push_back(node); - } - } - - //initialize eval results - std::vector evalResults; - for (int i=0; i< evalNodes.size(); i++) - { - evalResults.push_back((ElemType)0); - evalNodes[i]->Reset(); - } - - //prepare features and labels - std::vector & FeatureNodes = m_net.FeatureNodes(); - std::vector & labelNodes = m_net.LabelNodes(); - - std::map*> inputMatrices; - for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); - } - for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); - } - - //evaluate through minibatches - size_t totalEpochSamples = 0; - size_t numMBsRun = 0; - size_t actualMBSize = 0; - size_t numSamplesLastMBs = 0; - size_t lastMBsRun = 0; //MBs run before this display - - std::vector evalResultsLastMBs; - for (int i=0; i< evalResults.size(); i++) - evalResultsLastMBs.push_back((ElemType)0); - - dataReader.StartMinibatchLoop(mbSize, 0, testSize); - dataReader.SetNbrSlicesEachRecurrentIter(1); - - for (int i=0; iOperationName() == L"ClassBasedCrossEntropyWithSoftmax") - { - size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); - if(inputMatrices.find(L"classinfo") == inputMatrices.end()) - { - inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, m_net.GetDeviceID()); - inputMatrices[L"classinfo"] = new Matrix(vSz, 1, m_net.GetDeviceID()); - } - ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evalNodes[i]; - crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); - } - } - - while (dataReader.GetMinibatch(inputMatrices)) - { - 
UpdateEvalTimeStamps(FeatureNodes); - UpdateEvalTimeStamps(labelNodes); - - actualMBSize = m_net.GetActualMBSize(); - m_net.SetActualMiniBatchSize(actualMBSize); - m_net.SetActualNbrSlicesInEachRecIter(dataReader.NumberSlicesInEachRecurrentIter()); - dataReader.SetSentenceEndInBatch(m_net.m_sentenceEnd); - - for (int i=0; iFunctionValues().Get00Element(); //criterionNode should be a scalar - } - - totalEpochSamples += actualMBSize; - numMBsRun++; - - if (m_traceLevel > 0) - { - numSamplesLastMBs += actualMBSize; - - if (numMBsRun % m_numMBsToShowResult == 0) - { - DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); - - for (int i=0; i 0 && numSamplesLastMBs > 0) - { - DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); - } - - //final statistics - for (int i=0; i& dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize) - { - - std::vector FeatureNodes = m_net.FeatureNodes(); - std::vector labelNodes = m_net.LabelNodes(); - std::vector criterionNodes = m_net.FinalCriterionNodes(); - std::vector evaluationNodes = m_net.EvaluationNodes(); - - if (criterionNodes.size()==0) - { - throw std::runtime_error("No CrossEntropyWithSoftmax node found\n"); - } - if (evaluationNodes.size()==0) - { - throw std::runtime_error("No Evaluation node found\n"); - } - - std::map*> inputMatrices; - for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); - } - for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); - } - inputMatrices[L"numberobs"] = new Matrix(1,1, m_net.GetDeviceID()); - - dataReader.StartMinibatchLoop(mbSize, 0, testSize); - - ElemType epochEvalError = 0; - ElemType epochCrossEntropy = 0; - size_t totalEpochSamples = 0; - ElemType prevEpochEvalError = 0; - ElemType prevEpochCrossEntropy = 0; - size_t prevTotalEpochSamples = 0; - size_t prevStart = 1; - size_t numSamples = 0; - ElemType crossEntropy = 0; - ElemType evalError = 0; - - ofstream outputStream; - if (output) - { -#ifdef _MSC_VER - outputStream.open(output); -#else - outputStream.open(charpath(output)); // GCC does not implement wide-char pathnames here -#endif - } - - size_t numMBsRun = 0; - size_t actualMBSize = 0; - while (dataReader.GetMinibatch(inputMatrices)) - { - size_t nbrSamples = (size_t)(*inputMatrices[L"numberobs"])(0, 0); - actualMBSize = nbrSamples; - - for (int npos = 0; npos < nbrSamples ; npos++) - { - FeatureNodes[npos]->UpdateEvalTimeStamp(); - labelNodes[npos]->UpdateEvalTimeStamp(); - - m_net.Evaluate(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more? 
- - m_net.Evaluate(evaluationNodes[npos]); - - ElemType mbCrossEntropy = criterionNodes[npos]->FunctionValues().Get00Element(); // criterionNode should be a scalar - epochCrossEntropy += mbCrossEntropy; - - ElemType mbEvalError = evaluationNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - - epochEvalError += mbEvalError; - } - - totalEpochSamples += actualMBSize; - - if (outputStream.is_open()) - { - //TODO: add support to dump multiple outputs - ComputationNodePtr outputNode = m_net.OutputNodes()[0]; - foreach_column(j, outputNode->FunctionValues()) - { - foreach_row(i,outputNode->FunctionValues()) - { - outputStream<FunctionValues()(i,j)<<" "; - } - outputStream< 0) - { - crossEntropy = epochCrossEntropy - prevEpochCrossEntropy; - evalError = epochEvalError - prevEpochEvalError; - fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", - prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples); - } - - //final statistics - epochEvalError /= (ElemType)totalEpochSamples; - epochCrossEntropy /= (ElemType)totalEpochSamples; - fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy); - if (outputStream.is_open()) - { - outputStream.close(); - } - evalSetCrossEntropy = epochCrossEntropy; - return epochEvalError; - } - - protected: - void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector& evalNodes, - const vector & evalResults, const vector & evalResultsLastMBs) - { - fprintf(stderr,"Minibatch[%lu-%lu]: Samples Seen = %lu ", startMBNum, endMBNum, numSamplesLastMBs); - - for (size_t i=0; iNodeName().c_str(), (evalResults[i]-evalResultsLastMBs[i])/numSamplesLastMBs); - } - - fprintf(stderr, "\n"); - } - - protected: - ComputationNetwork& m_net; - size_t m_numMBsToShowResult; - int m_traceLevel; - void operator=(const SimpleEvaluator&); // (not assignable) - }; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +#pragma once + +#include "ComputationNetwork.h" +#include "ComputationNetworkHelper.h" +#include "DataReader.h" +#include +#include +#include +#include "basetypes.h" +#include "fileutil.h" +#include "commandArgUtil.h" +#include + +using namespace std; + +namespace Microsoft { namespace MSR { namespace CNTK { + + template + class SimpleEvaluator : ComputationNetworkHelper + { + typedef ComputationNetworkHelper B; + using B::UpdateEvalTimeStamps; + protected: + typedef ComputationNode* ComputationNodePtr; + typedef ClassBasedCrossEntropyWithSoftmaxNode* ClassBasedCrossEntropyWithSoftmaxNodePtr; + + public: + + SimpleEvaluator(ComputationNetwork& net, const size_t numMBsToShowResult=100, const int traceLevel=0) + : m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel) + { + } + + //returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes) + vector Evaluate(IDataReader& dataReader, const vector& evalNodeNames, const size_t mbSize, const size_t testSize=requestDataSize) + { + //specify evaluation nodes + std::vector evalNodes; + + if (evalNodeNames.size() == 0) + { + fprintf (stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); + if (m_net.EvaluationNodes().size() == 0 && m_net.FinalCriterionNodes().size() == 0) + throw std::logic_error("There is no default evalnodes or training criterion node specified in the network."); + + for (int i=0; i< m_net.EvaluationNodes().size(); i++) + evalNodes.push_back(m_net.EvaluationNodes()[i]); + + for (int i=0; i< m_net.FinalCriterionNodes().size(); i++) + evalNodes.push_back(m_net.FinalCriterionNodes()[i]); + } + else + { + for (int i=0; iFunctionValues().GetNumElements() == 1) + { + throw std::logic_error("The nodes passed to SimpleEvaluator::Evaluate function must be either eval or training criterion nodes (which evalues to 1x1 value)."); + } + evalNodes.push_back(node); + } + } + + //initialize eval results + std::vector evalResults; + for (int i=0; i< evalNodes.size(); i++) + { + evalResults.push_back((ElemType)0); + evalNodes[i]->Reset(); + } + + //prepare features and labels + std::vector & FeatureNodes = m_net.FeatureNodes(); + std::vector & labelNodes = m_net.LabelNodes(); + + std::map*> inputMatrices; + for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); + } + for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); + } + + //evaluate through minibatches + size_t totalEpochSamples = 0; + size_t numMBsRun = 0; + size_t actualMBSize = 0; + size_t numSamplesLastMBs = 0; + size_t lastMBsRun = 0; //MBs run before this display + + std::vector evalResultsLastMBs; + for (int i=0; i< evalResults.size(); i++) + evalResultsLastMBs.push_back((ElemType)0); + + dataReader.StartMinibatchLoop(mbSize, 0, testSize); + + for (int i=0; iOperationName() == L"ClassBasedCrossEntropyWithSoftmax") + { + size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); + if(inputMatrices.find(L"classinfo") == inputMatrices.end()) + { + inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, m_net.GetDeviceID()); + inputMatrices[L"classinfo"] = new Matrix(vSz, 1, m_net.GetDeviceID()); + } + ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evalNodes[i]; + crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); + } + } + + while (dataReader.GetMinibatch(inputMatrices)) + { + UpdateEvalTimeStamps(FeatureNodes); + 
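+            // (Evaluation here is cached by timestamp: a node is recomputed only when an
+            // input carries a newer timestamp, so the feature and label nodes are bumped
+            // to mark the freshly loaded minibatch as new before Evaluate() runs.)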
UpdateEvalTimeStamps(labelNodes); + + actualMBSize = m_net.GetActualMBSize(); + m_net.SetActualMiniBatchSize(actualMBSize); + m_net.SetActualNbrSlicesInEachRecIter(dataReader.NumberSlicesInEachRecurrentIter()); + dataReader.SetSentenceEndInBatch(m_net.m_sentenceEnd); + + for (int i=0; iFunctionValues().Get00Element(); //criterionNode should be a scalar + } + + totalEpochSamples += actualMBSize; + numMBsRun++; + + if (m_traceLevel > 0) + { + numSamplesLastMBs += actualMBSize; + + if (numMBsRun % m_numMBsToShowResult == 0) + { + DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); + + for (int i=0; i 0 && numSamplesLastMBs > 0) + { + DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); + } + + //final statistics + for (int i=0; i& dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize) + { + + std::vector FeatureNodes = m_net.FeatureNodes(); + std::vector labelNodes = m_net.LabelNodes(); + std::vector criterionNodes = m_net.FinalCriterionNodes(); + std::vector evaluationNodes = m_net.EvaluationNodes(); + + if (criterionNodes.size()==0) + { + throw std::runtime_error("No CrossEntropyWithSoftmax node found\n"); + } + if (evaluationNodes.size()==0) + { + throw std::runtime_error("No Evaluation node found\n"); + } + + std::map*> inputMatrices; + for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); + } + for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); + } + inputMatrices[L"numberobs"] = new Matrix(1,1, m_net.GetDeviceID()); + + dataReader.StartMinibatchLoop(mbSize, 0, testSize); + + ElemType epochEvalError = 0; + ElemType epochCrossEntropy = 0; + size_t totalEpochSamples = 0; + ElemType prevEpochEvalError = 0; + ElemType prevEpochCrossEntropy = 0; + size_t prevTotalEpochSamples = 0; + size_t prevStart = 1; + size_t numSamples = 0; + ElemType crossEntropy = 0; + ElemType evalError = 0; + + ofstream outputStream; + if (output) + { +#ifdef _MSC_VER + outputStream.open(output); +#else + outputStream.open(charpath(output)); // GCC does not implement wide-char pathnames here +#endif + } + + size_t numMBsRun = 0; + size_t actualMBSize = 0; + while (dataReader.GetMinibatch(inputMatrices)) + { + size_t nbrSamples = (size_t)(*inputMatrices[L"numberobs"])(0, 0); + actualMBSize = nbrSamples; + + for (int npos = 0; npos < nbrSamples ; npos++) + { + FeatureNodes[npos]->UpdateEvalTimeStamp(); + labelNodes[npos]->UpdateEvalTimeStamp(); + + m_net.Evaluate(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more? 
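+
+                    // In this unrolled path the reader reports the observation count through
+                    // the special "numberobs" input (read above into nbrSamples), and the
+                    // node vectors are indexed per observation, so position npos of the
+                    // criterion/evaluation nodes scores observation npos of the minibatch.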
+ + m_net.Evaluate(evaluationNodes[npos]); + + ElemType mbCrossEntropy = criterionNodes[npos]->FunctionValues().Get00Element(); // criterionNode should be a scalar + epochCrossEntropy += mbCrossEntropy; + + ElemType mbEvalError = evaluationNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar + + epochEvalError += mbEvalError; + } + + totalEpochSamples += actualMBSize; + + if (outputStream.is_open()) + { + //TODO: add support to dump multiple outputs + ComputationNodePtr outputNode = m_net.OutputNodes()[0]; + foreach_column(j, outputNode->FunctionValues()) + { + foreach_row(i,outputNode->FunctionValues()) + { + outputStream<FunctionValues()(i,j)<<" "; + } + outputStream< 0) + { + crossEntropy = epochCrossEntropy - prevEpochCrossEntropy; + evalError = epochEvalError - prevEpochEvalError; + fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", + prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples); + } + + //final statistics + epochEvalError /= (ElemType)totalEpochSamples; + epochCrossEntropy /= (ElemType)totalEpochSamples; + fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy); + if (outputStream.is_open()) + { + outputStream.close(); + } + evalSetCrossEntropy = epochCrossEntropy; + return epochEvalError; + } + + protected: + void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector& evalNodes, + const vector & evalResults, const vector & evalResultsLastMBs) + { + fprintf(stderr,"Minibatch[%lu-%lu]: Samples Seen = %lu ", startMBNum, endMBNum, numSamplesLastMBs); + + for (size_t i=0; iNodeName().c_str(), (evalResults[i]-evalResultsLastMBs[i])/numSamplesLastMBs); + } + + fprintf(stderr, "\n"); + } + + protected: + ComputationNetwork& m_net; + size_t m_numMBsToShowResult; + int m_traceLevel; + void operator=(const SimpleEvaluator&); // (not assignable) + }; + +}}} diff --git a/MachineLearning/cn/SynchronousExecutionEngine.h b/MachineLearning/cn/SynchronousExecutionEngine.h index cebec5069..94ffea8bf 100644 --- a/MachineLearning/cn/SynchronousExecutionEngine.h +++ b/MachineLearning/cn/SynchronousExecutionEngine.h @@ -1,780 +1,780 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - -#pragma once - -#include "IExecutionEngine.h" -#include "ComputationNetwork.h" -#include "fileutil.h" // for fexists() - -namespace Microsoft { namespace MSR { namespace CNTK { - -// SynchronousNodeEvaluator -// Process the Network Description Language into a Computation Network useable -// by SynchronousExecutionEngine. -template -class SynchronousNodeEvaluator : public NDLNodeEvaluator -{ -public: - // Constructor - create evaluator - SynchronousNodeEvaluator(ComputationNetwork& cn) : m_net(cn) - { } - - // Evaluate - evaluate a node and translate into underlying - // node - node we are evaluating - // baseName - base name for all symbols at this level - // pass - NDLPass through the evaluation (0-initial, 1-resolve variables, 2-final) - virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) - { - // constants don't need to be evaluated, they just translate into numbers... 
- if (node->GetType() == ndlTypeConstant - || node->GetType() == ndlTypeArray) - return; - - // setup the node parameters, where they start in the parameter list, and how many there are - // this is needed for the ndlPassResolve step to hookup all the inputs - int nodeParamStart = 0; - int nodeParamCount = 0; - - // get the parameters - std::vector*> parameter = node->GetParameters(); - - // get the name for the symbol to be used by CN nodes - std::wstring name = msra::strfun::utf16(node->GetName()); - if (!baseName.empty()) - { - name = baseName + L"." + name; - } - - std::wstring cnNodeType = msra::strfun::utf16(node->GetValue()); - - ComputationNodePtr nodePtr = nullptr; - - // get the node pointer for the node, should be stored in the EvalValue; - if (pass > ndlPassInitial) - { - nodePtr = (ComputationNodePtr)node->GetEvalValue(); - if (nodePtr == nullptr) - { - nodePtr = (ComputationNodePtr)m_net.GetNodeFromName(name); - node->SetEvalValue(nodePtr); - } - } - - if (InputValue::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; - - // first look for this node already existing in the network - if (m_net.NodeNameExist(name)) - nodePtr = m_net.GetNodeFromName(name); - else - nodePtr = m_net.CreateInputNode(name, rows, cols); - } - } - else if (SparseInputValue::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; - - // first look for this node already existing in the network - if (m_net.NodeNameExist(name)) - nodePtr = m_net.GetNodeFromName(name); - else - nodePtr = m_net.CreateSparseInputNode(name, rows, cols); - } - } - else if (cnNodeType == L"ImageInput") - { - if (parameter.size() < 3 || parameter.size() > 4) - RuntimeError("%ws should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t imageWidth = ((NDLNode*)params[0])->GetScalar(); - size_t imageHeight = ((NDLNode*)params[1])->GetScalar(); - size_t imageChannels = ((NDLNode*)params[2])->GetScalar(); - size_t numImages = parameter.size() > 3 ? 
((NDLNode*)params[3])->GetScalar() : 1; - - nodePtr = m_net.CreateInputNode(name, imageWidth, imageHeight, imageChannels, numImages); - } - } - else if (LearnableParameter::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; - - bool needGradient = node->GetOptionalParameter("needGradient", "true"); - - nodePtr = m_net.CreateLearnableParameter(name, rows, cols); - - nodePtr->NeedGradient() = needGradient; - } - else if (pass == ndlPassFinal) - { - static int randomSeed = 1; - std::string initString = node->GetOptionalParameter("init", "uniform"); - ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); - ElemType value = node->GetOptionalParameter("value", "0"); - - msra::strfun::tolower_ascii (initString); - if (initString == "fixedvalue") - nodePtr->FunctionValues().SetValue(value); - else if (initString == "uniform") - m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); - else if (initString == "gaussian") - m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); - else if (initString == "fromfile") - { - std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); - if (initFromFilePath == "") - RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); - if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') - // remove the opening and closing double quotes - initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); - if(!fexists(initFromFilePath)) - RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); - m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); - } - else - RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); - } - } - else if (SparseLearnableParameter::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? 
((NDLNode*)params[1])->GetScalar() : 1; - - bool needGradient = node->GetOptionalParameter("needGradient", "true"); - - nodePtr = m_net.CreateSparseLearnableParameter(name, rows, cols); - - nodePtr->NeedGradient() = needGradient; - } - else if (pass == ndlPassFinal) - { - static int randomSeed = 1; - std::string initString = node->GetOptionalParameter("init", "uniform"); - ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); - ElemType value = node->GetOptionalParameter("value", "0"); - - msra::strfun::tolower_ascii(initString); - if (initString == "fixedvalue") - nodePtr->FunctionValues().SetValue(value); - else if (initString == "uniform") - m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); - else if (initString == "gaussian") - m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); - else if (initString == "fromfile") - { - std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); - if (initFromFilePath == "") - RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); - if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') - // remove the opening and closing double quotes - initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); - if(!fexists(initFromFilePath)) - RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); - m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); - } - else - RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); - } - } - else if (cnNodeType == L"Constant") - { - if (parameter.size() != 1) - RuntimeError("Constant should have 1 fixed parameter [val] and two optional parameters [rows=[1|yourvalue], cols=[1|yourvalue]]."); - - if (pass == ndlPassInitial) - { - size_t rows = node->GetOptionalParameter("rows", "1"); - size_t cols = node->GetOptionalParameter("cols", "1"); - - nodePtr = m_net.CreateLearnableParameter(name, rows, cols); - nodePtr->NeedGradient() = false; - } - else if (pass == ndlPassFinal) - { - ElemType val = parameter[0]->GetScalar(); - nodePtr->FunctionValues().SetValue(val); - } - } - else if (cnNodeType == RowSliceNode::TypeName()) - { - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - // parameters are (rows, [cols], inputNode) - nodeParamStart = parameter.size() > 2?2:1; - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t start_index = ((NDLNode*)params[0])->GetScalar(); - size_t num_rows = ((NDLNode*)params[1])->GetScalar(); - - bool needGradient = node->GetOptionalParameter("needGradient", "false"); - nodePtr = m_net.RowSlice(NULL, start_index, num_rows, name); - nodePtr->NeedGradient() = needGradient; - - } - } - else if (cnNodeType == DelayNode::TypeName()) - { - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - // parameters are (rows, [cols], delayNode) - nodeParamStart = parameter.size() > 2?2:1; - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - // if we have three parameters the second is columns - size_t cols = parameter.size() > 2 ? 
((NDLNode*)params[1])->GetScalar() : 1; - - bool needGradient = node->GetOptionalParameter("needGradient", "false"); - float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); - nodePtr = m_net.Delay(NULL, defaultHiddenActivity, rows, cols, name); - size_t delayTime = node->GetOptionalParameter("delayTime","1"); - ((DelayNode*)nodePtr)->SetDelay(delayTime); - - nodePtr->NeedGradient() = needGradient; - } - } - else if (cnNodeType == ConvolutionNode::TypeName()) - { - if (parameter.size() != 7) - RuntimeError("%ws should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str()); - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 2; - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - int id = 2; // skip weightNode and inputValueNode - - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, id, parameter.size()-id, pass); - id = 0; // reset counter because the params array starts at zero - size_t kernelWidth = ((NDLNode*)params[id++])->GetScalar(); - size_t kernelHeight = ((NDLNode*)params[id++])->GetScalar(); - size_t outputChannels = ((NDLNode*)params[id++])->GetScalar(); - size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); - size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); - - assert (id == 5); - - //optional - bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false"); - size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0"); - - - nodePtr = m_net.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels, - horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples); - } - } - else if (cnNodeType == MaxPoolingNode::TypeName()) - { - if (parameter.size() != 5) - RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - int id = 1; // skip inputValueNode - - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass); - id = 0; // reset counter because the params array starts at zero - size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); - size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); - size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); - size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); - - assert (id == 4); - - nodePtr = m_net.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, - horizontalSubsample, verticalSubsample, name); - } - } - else if (cnNodeType == AveragePoolingNode::TypeName()) - { - if (parameter.size() != 5) - RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - int id = 1; // skip inputValueNode - - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, id, 
parameter.size() - id, pass); - id = 0; // reset counter because the params array starts at zero - size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); - size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); - size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); - size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); - - assert (id == 4); - - nodePtr = m_net.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, - horizontalSubsample, verticalSubsample, name); - } - } - else - { - - // setup the variables for node parameter processing - nodeParamCount = parameter.size(); // all parameters are nodes in standard nodes - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - nodePtr = m_net.CreateComputationNode(node->GetValue(), name); - } - } - - switch (pass) - { - case ndlPassInitial: - node->SetEvalValue(nodePtr); - // evaluate parameters - EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); - break; - case ndlPassResolve: - { - std::vector inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); - - switch (inputs.size()) - { - case 1: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); - break; - case 2: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1])); - break; - case 3: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2])); - break; - case 4: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3])); - break; - default: - if (nodeParamCount > 0) - RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str()); - break; - } - - // process common optional parameters (like "tag"); - ProcessOptionalParameters(node); - break; - } - case ndlPassFinal: - break; - } - } - -#ifdef LATER - // EvaluateDotName - Evaluate a dot name and resolve to target node - // node - NDLNode of the script - // nodeParam - NDLNode parameter we are evaluating - // baseName - name of the base node - // pass - which pass through the NDL nodes - // returns: the node that is the evaluated parameter - virtual NDLNode* EvaluateDotName(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseNameP, const NDLPass pass) - - { - if (pass > ndlPassInitial && evaluateNode) - { - std::string name = nodeParam->GetName(); - std::wstring wname = msra::strfun::utf16(name); - if (nodeParam->GetType() == ndlTypeDotParameter) - { - // When we see a variable of the form "A.B" in a macro, we need to resolve it to an actual node, by first constructing it's - // fully-qualified name. There are 2 possibilities: - // 1) "A" was defined locally within the macro. In this case, we must find the fully-qualified name of the node that this macro - // call is being assigned to (eg, "C" in the example "C=Macro(X)"), and concatenate it's name with "A.B" (eg, "C.A.B"). - // 2) "A" was passed in as a parameter to a macro. In this case, we must find the fully-qualified name of the node that - // was passed in as "A", and replace the "A" and "A.B" with this name. - - // Consider the following example: - // NdlBLob=[ - // P=MacroCall1(...) - // C=MacroCall2(P) - // ] - // # MacroDefinition - // MacroCall2(X) - // { - // A=MacroCall3(...) - // D=Times(A.B,X.B)} - // } - // - - // In this example, in the call D=Times(A.B,X.B), we need to resolve A.B and X.B appropriately. 
- // Specifically, "A.B" must be resolved to the fully qualified name "C.A.B", whereas "X.B" must be resolved to the fully qualified name "P.B". - // We then use this fully-qualified name to look up this node in the model (using "m_net.GetNodeFromName"). - - std::size_t firstDotPos = name.find_first_of("."); - if (firstDotPos == std::string::npos) - { - LogicError("nodeParam of type \"ndlTypeDotParameter\" doesn't have a dot in its name: %s", name.c_str()); - } - - std::string nameBeforeDot = name.substr(0, firstDotPos); - std::string nameAfterDot = name.substr(firstDotPos + 1, name.size() - (firstDotPos + 1)); - - // look up if "nameBeforeDot" was a parameter to the macro. - NDLNode* resolvedParam = nodeParam->GetParentScript()->FindSymbol(nameBeforeDot); - if (resolvedParam != nullptr && resolvedParam->GetType() == ndlTypeMacroCall) - { - // if "nameBeforeDot" was a parameter to the macro, builds it's fully qualified name by - // replacing "nameBeforeDot" with the fully qualified name of the node passed in as the parameter. - NDLScript* parentScript = resolvedParam->GetParentScript(); - baseName = parentScript->GetBaseName(); - std::wstring resolvedParamName = msra::strfun::utf16(resolvedParam->GetName()); - wname = baseName.empty() ? - resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot) : - baseName + L"." + resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot); - } - else if (!baseName.empty()) - { - // else, "nameBeforeDot" wasn't a parameter to the macro, so treat it as a local variable. - wname = baseName + L"." + wname; - } - } - else if (!baseName.empty()) - { - wname = baseName + L"." + wname; - } - - // fully qualified names can be looked up in the model - if (m_net.NodeNameExist(wname)) - { - void* np = (void*)m_net.GetNodeFromName(wname); - nodeParam->SetEvalValue(np); - } - // NOTE: there is a bug here, we allow an abbreviated node reference (i.e. L1.BFF) based on return values in NDL - // when the actual full node reference that the computational network uses would be L1.BFF.FF.P, so that is what CN sees - // can we do the normal find symbol here to allow abbreviated node references? - - // if we still didn't get a value, throw an error - if (nodeParam->GetEvalValue() == nullptr) - { - LogicError("Dot name could not be resolved '%s': should have a node named '%ls' in computational network\n", nodeParam->GetName().c_str(), name.c_str()); - } - } - return nodeParam; - } -#endif - - // EvaluateParameter - Evaluate a parameter of a call - // node - NDLNode of the script - // nodeParam - NDLNode parameter we are evaluating - // baseName - name of the base node - // pass - which pass through the NDL nodes - // returns: the node that is the evaluated parameter - virtual NDLNode* EvaluateParameter(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseNameP, const NDLPass pass ) - { - // get the parent script that includes the symbol table we are interested in - NDLScript* script = node->GetParentScript(); - wstring baseName = baseNameP; - if (script == NULL) - { - std::wstring name = baseName + L"." 
+ msra::strfun::utf16(node->GetName()); - LogicError("no script for a parameter node in call to %ls\n", name.c_str()); - } - - // evaluate the parameter if we haven't yet, or if we are in the resolve pass (need to set the inputs) - bool evaluateNode = nodeParam->GetEvalValue() == NULL || pass == ndlPassResolve; - switch (nodeParam->GetType()) - { - // if the node is a parameter then look it up in the symbol table - case ndlTypeUndetermined: // an undetermined parameter needs to be looked up again in the symbol table - case ndlTypeParameter: - { - // lookup the parameter - NDLNode* nodeResolve = script->FindSymbol(nodeParam->GetName()); - - // if we have resolved the name, no need to continue evaluation - if (!(pass == ndlPassResolve && nodeResolve && nodeParam->GetEvalValue() == nullptr)) - { - break; - } - if (pass > ndlPassInitial && evaluateNode && nodeResolve) - { - std::string name = nodeResolve->GetName(); - // we need to start from the parent script, because that is the namespace of the parameter being passed in - NDLScript* parentScript = nodeResolve->GetParentScript(); - nodeResolve = parentScript->FindSymbol(name); - - // if we still didn't get a value - if (nodeResolve == nullptr || nodeResolve->GetEvalValue() == nullptr) - { - // check for the fully quantified name in the computation network - // this is needed for MEL processing, since CN nodes names can be used as parameters in MEL - std::wstring wname = msra::strfun::utf16(name); - if (m_net.NodeNameExist(wname)) - { - void* np = (void*)m_net.GetNodeFromName(wname); - // if we don't have a resolve node, it's because the name didn't exist in NDL - if (!nodeResolve) - nodeResolve = nodeParam; - nodeResolve->SetEvalValue(np); - } - else - { - RuntimeError("Parameter name could not be resolved '%s'\n", name.c_str()); - } - } - } - nodeParam = nodeResolve; - break; - } - case ndlTypeFunction: - if (evaluateNode) - Evaluate(nodeParam, baseName, pass); - break; - case ndlTypeMacroCall: - if (evaluateNode) - nodeParam->EvaluateMacro(*this, baseName, pass); - break; - // constants and variables are good as is - case ndlTypeConstant: - case ndlTypeVariable: - break; - // everything else is illegal as a parameter - default: - { - std::wstring name = baseName + L"." 
+ msra::strfun::utf16(node->GetName()); - RuntimeError("Invalid parameter (macro definitions and arrays not allowed), see call to %ls\n", name.c_str()); - } - break; - } - return nodeParam; - } - - // EvaluateParameters - Evaluate the parameters of a call - // node - NDLNode we are evaluating paramters for - // baseName - baseName for the current node - // nodeParamStart - starting parameter that contains a node - // nodeParamCount - ending parameter that contains a node - // pass - NDL pass we are evaluating - // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator - virtual std::vector EvaluateParameters(NDLNode* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass) - { - std::vector inputs; - std::vector*> parameter = node->GetParameters(); - ConfigArray paramString = node->GetParamString(); - - if (parameter.size() < 1) - { - return inputs; - } - if (nodeParamStart + nodeParamCount > parameter.size()) - throw logic_error("EvaluateParmeters: nodeParamters specified that do not exist"); - size_t numChildren = nodeParamCount; - for (size_t i=0; i < numChildren;++i) - { - int index = i+nodeParamStart; - NDLNode* nodeParam = parameter[index]; - std::wstring paramS = paramString[index]; - - // default base is same as current - std::wstring baseSymbol = baseName; - - NDLNode* nodeResult = EvaluateParameter(node, nodeParam, baseSymbol, pass); - // look for a prefix here and set baseName appropriately - - if (pass == ndlPassResolve) - { - void* np = nodeResult->GetEvalValue(); - assert(np != nullptr); - inputs.push_back((void*)np); - } - else if (pass == ndlPassInitial) // for initial pass we are only interested in resolved nodes (to get constant values) - { - inputs.push_back((void*)nodeResult); - } - // NOTE: in final pass inputs are always NULL - } - - // now return the vector - return inputs; - } - - // ProcessOptionalParameters - Process the optional parameters of a node - virtual void ProcessOptionalParameters(NDLNode* node) - { - vector*> params = node->GetParameters(true); // get all the optional parameters only - ComputationNode* compNode = (ComputationNode*)node->GetEvalValue(); - std::string empty; - - // loop through all the optional parameters processing them as necessary - for (NDLNode* param : params) - { - // make sure it's a "tag" optional parameter, that's all we process currently - if (_stricmp(param->GetName().c_str(), "tag")) - continue; - - std::string value = param->GetValue(); - if (!_stricmp(value.c_str(), "feature")) - { - SetOutputNode(m_net.FeatureNodes(), compNode); - } - else if (!_stricmp(value.c_str(), "label")) - { - SetOutputNode(m_net.LabelNodes(), compNode); - } - else if (!_stricmp(value.c_str(), "criteria")) - { - SetOutputNode(m_net.FinalCriterionNodes(), compNode); - } - else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters - { - SetOutputNode(m_net.EvaluationNodes(), compNode); - } - else if (!_stricmp(value.c_str(), "output")) - { - SetOutputNode(m_net.OutputNodes(), compNode); - } - } - - } - - // SetOutputNode - Set the output node, checks to see if it already exists first - // nodeGroup - group vector to add to - // compNode - computation node to add - void SetOutputNode(std::vector*>& nodeGroup, ComputationNode* compNode) - { - for (ComputationNodePtr node : nodeGroup) - { - if (node == compNode) - return; - } - nodeGroup.push_back(compNode); - } - - // FindSymbol - Search the nodes for a fully quantified symbol - // symbol - name of the symbol fully 
quantified name with "dots" - // returns - pointer to the matching EvalValue for that node, of NULL if not found - virtual void* FindSymbol(const wstring& symbol) - { - if (m_net.NodeNameExist(symbol)) - return m_net.GetNodeFromName(symbol); - return NULL; - } - - virtual ~SynchronousNodeEvaluator() - { - } - -private: - ComputationNetwork& m_net; - typedef ComputationNode* ComputationNodePtr; - void operator=(const SynchronousNodeEvaluator&); -}; - -// SynchronousExecutionEngine -// TODO JC Refactor eligible methods and members into abstract base class. -template -class SynchronousExecutionEngine : public IExecutionEngine -{ -public: - SynchronousExecutionEngine(DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, unsigned long randomSeedOffset=0) - { - m_computationNetwork = new ComputationNetwork(deviceId); - m_computationNetwork->SetRandomSeedOffset(randomSeedOffset); - m_ownNetwork = true; - m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); - } - - SynchronousExecutionEngine(ComputationNetwork* computationNetwork) - { - m_computationNetwork = computationNetwork; - m_ownNetwork = false; - m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); - } - - virtual ~SynchronousExecutionEngine() - { - if (m_ownNetwork) - delete m_computationNetwork; - delete m_nodeEvaluator; - } - - ComputationNetwork& GetComputationNetwork() - { - return *m_computationNetwork; - } - - NDLNodeEvaluator& GetNodeEvaluator() - { - return *m_nodeEvaluator; - } - -private: - bool m_ownNetwork; - ComputationNetwork* m_computationNetwork; - SynchronousNodeEvaluator* m_nodeEvaluator; -protected: - // Copy constructor, should never be called. - SynchronousExecutionEngine(const SynchronousExecutionEngine& /*deepCopyFrom*/) - { - throw std::logic_error("'SynchronousExecutionEngine(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); - } - - // Assignment operator, should never be called. - SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& /*deepCopyFrom*/) - { - throw std::logic_error("'SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); - } -}; - -template class SynchronousExecutionEngine; -template class SynchronousExecutionEngine; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// + +#pragma once + +#include "IExecutionEngine.h" +#include "ComputationNetwork.h" +#include "fileutil.h" // for fexists() + +namespace Microsoft { namespace MSR { namespace CNTK { + +// SynchronousNodeEvaluator +// Process the Network Description Language into a Computation Network useable +// by SynchronousExecutionEngine. +template +class SynchronousNodeEvaluator : public NDLNodeEvaluator +{ +public: + // Constructor - create evaluator + SynchronousNodeEvaluator(ComputationNetwork& cn) : m_net(cn) + { } + + // Evaluate - evaluate a node and translate into underlying + // node - node we are evaluating + // baseName - base name for all symbols at this level + // pass - NDLPass through the evaluation (0-initial, 1-resolve variables, 2-final) + virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) + { + // constants don't need to be evaluated, they just translate into numbers... 
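+        // The evaluator runs over the NDL script three times (see the pass parameter
+        // above): ndlPassInitial creates ComputationNetwork nodes from scalar parameters,
+        // ndlPassResolve looks the input nodes up and attaches them via AttachInputs, and
+        // ndlPassFinal performs deferred work such as LearnableParameter initialization;
+        // the switch at the end of this method dispatches on the same three passes.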
+ if (node->GetType() == ndlTypeConstant + || node->GetType() == ndlTypeArray) + return; + + // setup the node parameters, where they start in the parameter list, and how many there are + // this is needed for the ndlPassResolve step to hookup all the inputs + int nodeParamStart = 0; + int nodeParamCount = 0; + + // get the parameters + std::vector*> parameter = node->GetParameters(); + + // get the name for the symbol to be used by CN nodes + std::wstring name = msra::strfun::utf16(node->GetName()); + if (!baseName.empty()) + { + name = baseName + L"." + name; + } + + std::wstring cnNodeType = msra::strfun::utf16(node->GetValue()); + + ComputationNodePtr nodePtr = nullptr; + + // get the node pointer for the node, should be stored in the EvalValue; + if (pass > ndlPassInitial) + { + nodePtr = (ComputationNodePtr)node->GetEvalValue(); + if (nodePtr == nullptr) + { + nodePtr = (ComputationNodePtr)m_net.GetNodeFromName(name); + node->SetEvalValue(nodePtr); + } + } + + if (InputValue::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; + + // first look for this node already existing in the network + if (m_net.NodeNameExist(name)) + nodePtr = m_net.GetNodeFromName(name); + else + nodePtr = m_net.CreateInputNode(name, rows, cols); + } + } + else if (SparseInputValue::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; + + // first look for this node already existing in the network + if (m_net.NodeNameExist(name)) + nodePtr = m_net.GetNodeFromName(name); + else + nodePtr = m_net.CreateSparseInputNode(name, rows, cols); + } + } + else if (cnNodeType == L"ImageInput") + { + if (parameter.size() < 3 || parameter.size() > 4) + RuntimeError("%ws should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t imageWidth = ((NDLNode*)params[0])->GetScalar(); + size_t imageHeight = ((NDLNode*)params[1])->GetScalar(); + size_t imageChannels = ((NDLNode*)params[2])->GetScalar(); + size_t numImages = parameter.size() > 3 ? 
((NDLNode*)params[3])->GetScalar() : 1; + + nodePtr = m_net.CreateInputNode(name, imageWidth, imageHeight, imageChannels, numImages); + } + } + else if (LearnableParameter::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; + + bool needGradient = node->GetOptionalParameter("needGradient", "true"); + + nodePtr = m_net.CreateLearnableParameter(name, rows, cols); + + nodePtr->NeedGradient() = needGradient; + } + else if (pass == ndlPassFinal) + { + static int randomSeed = 1; + std::string initString = node->GetOptionalParameter("init", "uniform"); + ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); + ElemType value = node->GetOptionalParameter("value", "0"); + + msra::strfun::tolower_ascii (initString); + if (initString == "fixedvalue") + nodePtr->FunctionValues().SetValue(value); + else if (initString == "uniform") + m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); + else if (initString == "gaussian") + m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); + else if (initString == "fromfile") + { + std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); + if (initFromFilePath == "") + RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); + if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') + // remove the opening and closing double quotes + initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); + if(!fexists(initFromFilePath)) + RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); + m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); + } + else + RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); + } + } + else if (SparseLearnableParameter::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? 
((NDLNode*)params[1])->GetScalar() : 1; + + bool needGradient = node->GetOptionalParameter("needGradient", "true"); + + nodePtr = m_net.CreateSparseLearnableParameter(name, rows, cols); + + nodePtr->NeedGradient() = needGradient; + } + else if (pass == ndlPassFinal) + { + static int randomSeed = 1; + std::string initString = node->GetOptionalParameter("init", "uniform"); + ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); + ElemType value = node->GetOptionalParameter("value", "0"); + + msra::strfun::tolower_ascii(initString); + if (initString == "fixedvalue") + nodePtr->FunctionValues().SetValue(value); + else if (initString == "uniform") + m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); + else if (initString == "gaussian") + m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); + else if (initString == "fromfile") + { + std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); + if (initFromFilePath == "") + RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); + if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') + // remove the opening and closing double quotes + initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); + if(!fexists(initFromFilePath)) + RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); + m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); + } + else + RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); + } + } + else if (cnNodeType == L"Constant") + { + if (parameter.size() != 1) + RuntimeError("Constant should have 1 fixed parameter [val] and two optional parameters [rows=[1|yourvalue], cols=[1|yourvalue]]."); + + if (pass == ndlPassInitial) + { + size_t rows = node->GetOptionalParameter("rows", "1"); + size_t cols = node->GetOptionalParameter("cols", "1"); + + nodePtr = m_net.CreateLearnableParameter(name, rows, cols); + nodePtr->NeedGradient() = false; + } + else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0) + { + ElemType val = parameter[0]->GetScalar(); + nodePtr->FunctionValues().SetValue(val); + } + } + else if (cnNodeType == RowSliceNode::TypeName()) + { + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + // parameters are (rows, [cols], inputNode) + nodeParamStart = parameter.size() > 2?2:1; + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t start_index = ((NDLNode*)params[0])->GetScalar(); + size_t num_rows = ((NDLNode*)params[1])->GetScalar(); + + bool needGradient = node->GetOptionalParameter("needGradient", "false"); + nodePtr = m_net.RowSlice(NULL, start_index, num_rows, name); + nodePtr->NeedGradient() = needGradient; + + } + } + else if (cnNodeType == DelayNode::TypeName()) + { + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + // parameters are (rows, [cols], delayNode) + nodeParamStart = parameter.size() > 2?2:1; + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + // if we have three parameters the second is columns + size_t cols = parameter.size() > 2 ? 
((NDLNode*)params[1])->GetScalar() : 1; + + bool needGradient = node->GetOptionalParameter("needGradient", "false"); + float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); + nodePtr = m_net.Delay(NULL, defaultHiddenActivity, rows, cols, name); + size_t delayTime = node->GetOptionalParameter("delayTime","1"); + ((DelayNode*)nodePtr)->SetDelay(delayTime); + + nodePtr->NeedGradient() = needGradient; + } + } + else if (cnNodeType == ConvolutionNode::TypeName()) + { + if (parameter.size() != 7) + RuntimeError("%ws should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 2; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + int id = 2; // skip weightNode and inputValueNode + + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, id, parameter.size()-id, pass); + id = 0; // reset counter because the params array starts at zero + size_t kernelWidth = ((NDLNode*)params[id++])->GetScalar(); + size_t kernelHeight = ((NDLNode*)params[id++])->GetScalar(); + size_t outputChannels = ((NDLNode*)params[id++])->GetScalar(); + size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); + size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); + + assert (id == 5); + + //optional + bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false"); + size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0"); + + + nodePtr = m_net.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels, + horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples); + } + } + else if (cnNodeType == MaxPoolingNode::TypeName()) + { + if (parameter.size() != 5) + RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + int id = 1; // skip inputValueNode + + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass); + id = 0; // reset counter because the params array starts at zero + size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); + size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); + size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); + size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); + + assert (id == 4); + + nodePtr = m_net.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, + horizontalSubsample, verticalSubsample, name); + } + } + else if (cnNodeType == AveragePoolingNode::TypeName()) + { + if (parameter.size() != 5) + RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + int id = 1; // skip inputValueNode + + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, id, 
parameter.size() - id, pass); + id = 0; // reset counter because the params array starts at zero + size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); + size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); + size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); + size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); + + assert (id == 4); + + nodePtr = m_net.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, + horizontalSubsample, verticalSubsample, name); + } + } + else + { + + // setup the variables for node parameter processing + nodeParamCount = parameter.size(); // all parameters are nodes in standard nodes + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + nodePtr = m_net.CreateComputationNode(node->GetValue(), name); + } + } + + switch (pass) + { + case ndlPassInitial: + node->SetEvalValue(nodePtr); + // evaluate parameters + EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); + break; + case ndlPassResolve: + { + std::vector inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); + + switch (inputs.size()) + { + case 1: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); + break; + case 2: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1])); + break; + case 3: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2])); + break; + case 4: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3])); + break; + default: + if (nodeParamCount > 0) + RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str()); + break; + } + + // process common optional parameters (like "tag"); + ProcessOptionalParameters(node); + break; + } + case ndlPassFinal: + break; + } + } + +#ifdef LATER + // EvaluateDotName - Evaluate a dot name and resolve to target node + // node - NDLNode of the script + // nodeParam - NDLNode parameter we are evaluating + // baseName - name of the base node + // pass - which pass through the NDL nodes + // returns: the node that is the evaluated parameter + virtual NDLNode* EvaluateDotName(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseNameP, const NDLPass pass) + + { + if (pass > ndlPassInitial && evaluateNode) + { + std::string name = nodeParam->GetName(); + std::wstring wname = msra::strfun::utf16(name); + if (nodeParam->GetType() == ndlTypeDotParameter) + { + // When we see a variable of the form "A.B" in a macro, we need to resolve it to an actual node, by first constructing it's + // fully-qualified name. There are 2 possibilities: + // 1) "A" was defined locally within the macro. In this case, we must find the fully-qualified name of the node that this macro + // call is being assigned to (eg, "C" in the example "C=Macro(X)"), and concatenate it's name with "A.B" (eg, "C.A.B"). + // 2) "A" was passed in as a parameter to a macro. In this case, we must find the fully-qualified name of the node that + // was passed in as "A", and replace the "A" and "A.B" with this name. + + // Consider the following example: + // NdlBLob=[ + // P=MacroCall1(...) + // C=MacroCall2(P) + // ] + // # MacroDefinition + // MacroCall2(X) + // { + // A=MacroCall3(...) + // D=Times(A.B,X.B)} + // } + // + + // In this example, in the call D=Times(A.B,X.B), we need to resolve A.B and X.B appropriately. 
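+                // (In that call, "A" is defined locally inside MacroCall2, while "X" is the
+                // macro parameter bound to the node "P" from the enclosing NdlBLob scope.)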
+                // Specifically, "A.B" must be resolved to the fully qualified name "C.A.B", whereas "X.B" must be resolved to the fully qualified name "P.B".
+                // We then use this fully-qualified name to look up this node in the model (using "m_net.GetNodeFromName").
+
+                std::size_t firstDotPos = name.find_first_of(".");
+                if (firstDotPos == std::string::npos)
+                {
+                    LogicError("nodeParam of type \"ndlTypeDotParameter\" doesn't have a dot in its name: %s", name.c_str());
+                }
+
+                std::string nameBeforeDot = name.substr(0, firstDotPos);
+                std::string nameAfterDot = name.substr(firstDotPos + 1, name.size() - (firstDotPos + 1));
+
+                // look up whether "nameBeforeDot" was a parameter to the macro.
+                NDLNode<ElemType>* resolvedParam = nodeParam->GetParentScript()->FindSymbol(nameBeforeDot);
+                if (resolvedParam != nullptr && resolvedParam->GetType() == ndlTypeMacroCall)
+                {
+                    // if "nameBeforeDot" was a parameter to the macro, build its fully qualified name by
+                    // replacing "nameBeforeDot" with the fully qualified name of the node passed in as the parameter.
+                    NDLScript<ElemType>* parentScript = resolvedParam->GetParentScript();
+                    baseName = parentScript->GetBaseName();
+                    std::wstring resolvedParamName = msra::strfun::utf16(resolvedParam->GetName());
+                    wname = baseName.empty() ?
+                        resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot) :
+                        baseName + L"." + resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot);
+                }
+                else if (!baseName.empty())
+                {
+                    // else, "nameBeforeDot" wasn't a parameter to the macro, so treat it as a local variable.
+                    wname = baseName + L"." + wname;
+                }
+            }
+            else if (!baseName.empty())
+            {
+                wname = baseName + L"." + wname;
+            }
+
+            // fully qualified names can be looked up in the model
+            if (m_net.NodeNameExist(wname))
+            {
+                void* np = (void*)m_net.GetNodeFromName(wname);
+                nodeParam->SetEvalValue(np);
+            }
+            // NOTE: there is a bug here, we allow an abbreviated node reference (i.e. L1.BFF) based on return values in NDL
+            // when the actual full node reference that the computational network uses would be L1.BFF.FF.P, so that is what CN sees
+            // can we do the normal find symbol here to allow abbreviated node references?
+
+            // if we still didn't get a value, throw an error
+            if (nodeParam->GetEvalValue() == nullptr)
+            {
+                LogicError("Dot name could not be resolved '%s': should have a node named '%s' in computational network\n", nodeParam->GetName().c_str(), name.c_str());
+            }
+        }
+        return nodeParam;
+    }
+#endif
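+
+    // Editorial note: EvaluateDotName above is compiled out (the LATER symbol appears
+    // to be undefined), so dot-qualified parameter names currently flow through
+    // EvaluateParameter below instead.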
+
+    // EvaluateParameter - Evaluate a parameter of a call
+    // node - NDLNode of the script
+    // nodeParam - NDLNode parameter we are evaluating
+    // baseName - name of the base node
+    // pass - which pass through the NDL nodes
+    // returns: the node that is the evaluated parameter
+    virtual NDLNode<ElemType>* EvaluateParameter(NDLNode<ElemType>* node, NDLNode<ElemType>* nodeParam, const std::wstring& baseNameP, const NDLPass pass)
+    {
+        // get the parent script that includes the symbol table we are interested in
+        NDLScript<ElemType>* script = node->GetParentScript();
+        wstring baseName = baseNameP;
+        if (script == NULL)
+        {
+            std::wstring name = baseName + L"." + msra::strfun::utf16(node->GetName());
+            LogicError("no script for a parameter node in call to %ls\n", name.c_str());
+        }
+
+        // evaluate the parameter if we haven't yet, or if we are in the resolve pass (need to set the inputs)
+        bool evaluateNode = nodeParam->GetEvalValue() == NULL || pass == ndlPassResolve;
+        switch (nodeParam->GetType())
+        {
+        // if the node is a parameter then look it up in the symbol table
+        case ndlTypeUndetermined: // an undetermined parameter needs to be looked up again in the symbol table
+        case ndlTypeParameter:
+        {
+            // look up the parameter
+            NDLNode<ElemType>* nodeResolve = script->FindSymbol(nodeParam->GetName());
+
+            // if we have resolved the name, no need to continue evaluation
+            if (!(pass == ndlPassResolve && nodeResolve && nodeParam->GetEvalValue() == nullptr))
+            {
+                break;
+            }
+            if (pass > ndlPassInitial && evaluateNode && nodeResolve)
+            {
+                std::string name = nodeResolve->GetName();
+                // we need to start from the parent script, because that is the namespace of the parameter being passed in
+                NDLScript<ElemType>* parentScript = nodeResolve->GetParentScript();
+                nodeResolve = parentScript->FindSymbol(name);
+
+                // if we still didn't get a value
+                if (nodeResolve == nullptr || nodeResolve->GetEvalValue() == nullptr)
+                {
+                    // check for the fully qualified name in the computation network
+                    // this is needed for MEL processing, since CN node names can be used as parameters in MEL
+                    std::wstring wname = msra::strfun::utf16(name);
+                    if (m_net.NodeNameExist(wname))
+                    {
+                        void* np = (void*)m_net.GetNodeFromName(wname);
+                        // if we don't have a resolve node, it's because the name didn't exist in NDL
+                        if (!nodeResolve)
+                            nodeResolve = nodeParam;
+                        nodeResolve->SetEvalValue(np);
+                    }
+                    else
+                    {
+                        RuntimeError("Parameter name could not be resolved '%s'\n", name.c_str());
+                    }
+                }
+            }
+            nodeParam = nodeResolve;
+            break;
+        }
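+        // Editorial summary (not original commentary): the parameter case above tries,
+        // in order, (1) the enclosing script's symbol table, (2) the parent script of
+        // the resolved symbol, and (3) the computation network itself by node name
+        // (the MEL case); only when all three fail is a RuntimeError raised.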
+        case ndlTypeFunction:
+            if (evaluateNode)
+                Evaluate(nodeParam, baseName, pass);
+            break;
+        case ndlTypeMacroCall:
+            if (evaluateNode)
+                nodeParam->EvaluateMacro(*this, baseName, pass);
+            break;
+        // constants and variables are good as is
+        case ndlTypeConstant:
+        case ndlTypeVariable:
+            break;
+        // everything else is illegal as a parameter
+        default:
+        {
+            std::wstring name = baseName + L"." + msra::strfun::utf16(node->GetName());
+            RuntimeError("Invalid parameter (macro definitions and arrays not allowed), see call to %ls\n", name.c_str());
+        }
+            break;
+        }
+        return nodeParam;
+    }
+
+    // EvaluateParameters - Evaluate the parameters of a call
+    // node - NDLNode we are evaluating parameters for
+    // baseName - baseName for the current node
+    // nodeParamStart - index of the first parameter that contains a node
+    // nodeParamCount - number of parameters that contain nodes
+    // pass - NDL pass we are evaluating
+    // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator
+    virtual std::vector<void*> EvaluateParameters(NDLNode<ElemType>* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass)
+    {
+        std::vector<void*> inputs;
+        std::vector<NDLNode<ElemType>*> parameter = node->GetParameters();
+        ConfigArray paramString = node->GetParamString();
+
+        if (parameter.size() < 1)
+        {
+            return inputs;
+        }
+        if (nodeParamStart + nodeParamCount > parameter.size())
+            throw logic_error("EvaluateParameters: node parameters specified that do not exist");
+        size_t numChildren = nodeParamCount;
+        for (size_t i = 0; i < numChildren; ++i)
+        {
+            int index = i + nodeParamStart;
+            NDLNode<ElemType>* nodeParam = parameter[index];
+            std::wstring paramS = paramString[index];
+
+            // default base is same as current
+            std::wstring baseSymbol = baseName;
+
+            NDLNode<ElemType>* nodeResult = EvaluateParameter(node, nodeParam, baseSymbol, pass);
+            // look for a prefix here and set baseName appropriately
+
+            if (pass == ndlPassResolve)
+            {
+                void* np = nodeResult->GetEvalValue();
+                assert(np != nullptr);
+                inputs.push_back((void*)np);
+            }
+            else if (pass == ndlPassInitial) // for the initial pass we are only interested in resolved nodes (to get constant values)
+            {
+                inputs.push_back((void*)nodeResult);
+            }
+            // NOTE: in the final pass inputs are always NULL
+        }
+
+        // now return the vector
+        return inputs;
+    }
+
+    // ProcessOptionalParameters - Process the optional parameters of a node
+    virtual void ProcessOptionalParameters(NDLNode<ElemType>* node)
+    {
+        vector<NDLNode<ElemType>*> params = node->GetParameters(true); // get all the optional parameters only
+        ComputationNode<ElemType>* compNode = (ComputationNode<ElemType>*)node->GetEvalValue();
+        std::string empty;
+
+        // loop through all the optional parameters, processing them as necessary
+        for (NDLNode<ElemType>* param : params)
+        {
+            // make sure it's a "tag" optional parameter; that's all we process currently
+            if (_stricmp(param->GetName().c_str(), "tag"))
+                continue;
+
+            std::string value = param->GetValue();
+            if (!_stricmp(value.c_str(), "feature"))
+            {
+                SetOutputNode(m_net.FeatureNodes(), compNode);
+            }
+            else if (!_stricmp(value.c_str(), "label"))
+            {
+                SetOutputNode(m_net.LabelNodes(), compNode);
+            }
+            else if (!_stricmp(value.c_str(), "criteria"))
+            {
+                SetOutputNode(m_net.FinalCriterionNodes(), compNode);
+            }
+            else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters
+            {
+                SetOutputNode(m_net.EvaluationNodes(), compNode);
+            }
+            else if (!_stricmp(value.c_str(), "output"))
+            {
+                SetOutputNode(m_net.OutputNodes(), compNode);
+            }
+        }
+    }
+
+    // SetOutputNode - Set the output node; checks to see whether it already exists first
+    // nodeGroup - group vector to add to
+    // compNode - computation node to add
+    void SetOutputNode(std::vector<ComputationNode<ElemType>*>& nodeGroup, ComputationNode<ElemType>* compNode)
+    {
+        for (ComputationNodePtr node : nodeGroup)
+        {
+            if (node == compNode)
+                return;
+        }
+        nodeGroup.push_back(compNode);
+    }
+
+    // FindSymbol - Search the nodes for a fully qualified symbol
+    // symbol - name of the symbol fully
quantified name with "dots" + // returns - pointer to the matching EvalValue for that node, of NULL if not found + virtual void* FindSymbol(const wstring& symbol) + { + if (m_net.NodeNameExist(symbol)) + return m_net.GetNodeFromName(symbol); + return NULL; + } + + virtual ~SynchronousNodeEvaluator() + { + } + +private: + ComputationNetwork& m_net; + typedef ComputationNode* ComputationNodePtr; + void operator=(const SynchronousNodeEvaluator&); +}; + +// SynchronousExecutionEngine +// TODO JC Refactor eligible methods and members into abstract base class. +template +class SynchronousExecutionEngine : public IExecutionEngine +{ +public: + SynchronousExecutionEngine(DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, unsigned long randomSeedOffset=0) + { + m_computationNetwork = new ComputationNetwork(deviceId); + m_computationNetwork->SetRandomSeedOffset(randomSeedOffset); + m_ownNetwork = true; + m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); + } + + SynchronousExecutionEngine(ComputationNetwork* computationNetwork) + { + m_computationNetwork = computationNetwork; + m_ownNetwork = false; + m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); + } + + virtual ~SynchronousExecutionEngine() + { + if (m_ownNetwork) + delete m_computationNetwork; + delete m_nodeEvaluator; + } + + ComputationNetwork& GetComputationNetwork() + { + return *m_computationNetwork; + } + + NDLNodeEvaluator& GetNodeEvaluator() + { + return *m_nodeEvaluator; + } + +private: + bool m_ownNetwork; + ComputationNetwork* m_computationNetwork; + SynchronousNodeEvaluator* m_nodeEvaluator; +protected: + // Copy constructor, should never be called. + SynchronousExecutionEngine(const SynchronousExecutionEngine& /*deepCopyFrom*/) + { + throw std::logic_error("'SynchronousExecutionEngine(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); + } + + // Assignment operator, should never be called. + SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& /*deepCopyFrom*/) + { + throw std::logic_error("'SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); + } +}; + +template class SynchronousExecutionEngine; +template class SynchronousExecutionEngine; + }}} \ No newline at end of file diff --git a/MachineLearning/cn/TrainingCriterionNode.h b/MachineLearning/cn/TrainingCriterionNode.h index 16709ec71..c962870a3 100644 --- a/MachineLearning/cn/TrainingCriterionNode.h +++ b/MachineLearning/cn/TrainingCriterionNode.h @@ -1,1245 +1,1245 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -#pragma once - -#include -#include -#include -#include -#include -#include -#include "ComputationNode.h" - -namespace Microsoft { namespace MSR { namespace CNTK { - //note: to save computation the gradient may be scaled by an constant. - - template - class SquareErrorNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - SquareErrorNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_leftMinusRight(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - SquareErrorNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_leftMinusRight(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"SquareError";} - - virtual void ComputeInputPartial(const size_t inputIndex) - { - if (inputIndex > 1) - throw std::invalid_argument("SquareError criteria only takes two inputs."); - - //left Node must be a scalar - if (inputIndex == 0) //left derivative - { - ComputeInputPartialLeft(Inputs(0)->GradientValues(), GradientValues(), m_leftMinusRight); - } - else - { - ComputeInputPartialRight(Inputs(1)->GradientValues(), GradientValues(), m_leftMinusRight); - } - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("SquareError node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialLeft(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) - { - inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), leftMinusRight); - } - - static void WINAPI ComputeInputPartialRight(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) - { - inputGradientValues.AddWithScaleOf(-gradientValues.Get00Element(), leftMinusRight); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsInput); - descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_leftMinusRight); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("SquareError node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, Matrix& leftMinusRight) - { - leftMinusRight.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1); - ElemType v = leftMinusRight.FrobeniusNorm(); - functionValues.Resize(1,1); - functionValues.SetValue(v*v/2); -#if NANCHECK - functionValues.HasNan("SquareError"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 2) - throw std::logic_error("SquareError operation requires two inputs."); - - size_t index = 0; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? 
Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - index = 1; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("SquareError operation: one of the operants has 0 element."); - - if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) - { - throw std::logic_error("The Matrix dimension in the SquareError operation does not match."); - } - - FunctionValues().Resize(1,1); - m_leftMinusRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode) - { - m_children.resize(2); - m_children[0] = leftNode; - m_children[1] = rightNode; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_leftMinusRight.GetDeviceId() != deviceId) - m_leftMinusRight.TransferFromDeviceToDevice(m_leftMinusRight.GetDeviceId(), deviceId,true); - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - SquareErrorNode* node = (SquareErrorNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_leftMinusRight = m_leftMinusRight; - } - } - - // copy constructor - SquareErrorNode(const SquareErrorNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_leftMinusRight(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new SquareErrorNode(this, name, flags); - return node; - } - - private: - Matrix m_leftMinusRight; - }; - - template class SquareErrorNode; - template class SquareErrorNode; - - //calculates: -sum(left_i * log(softmax_i(right))) - template - class CrossEntropyWithSoftmaxNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - CrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - CrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"CrossEntropyWithSoftmax";} - - virtual void ComputeInputPartial(const size_t inputIndex) - { - if (inputIndex > 1) - throw std::invalid_argument("CrossEntropyWithSoftmaxNode criterion only takes two inputs."); - - //left Node must be a scalar - if (inputIndex == 0) //left derivative - { - ComputeInputPartialLeft(m_logSoftmaxOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); - } - else - { - ComputeInputPartialRight(m_softmaxOfRight, Inputs(0)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); - } - - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialLeft(const Matrix& logSoftmaxOfRight, Matrix& inputGradientValues, - const Matrix& gradientValues) - { -#if DUMPOUTPUT - logSoftmaxOfRight.Print("CrossEntropyWithSoftmax Partial-logSoftmaxOfRight"); - gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-in"); -#endif - - Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues); -#if DUMPOUTPUT - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-out"); -#endif - - } - - static void WINAPI ComputeInputPartialRight(const Matrix& softmaxOfRight, const Matrix& inputFunctionValues, - Matrix& inputGradientValues, const Matrix& gradientValues) - { -#if DUMPOUTPUT - softmaxOfRight.Print("CrossEntropyWithSoftmax Partial-softmaxOfRight"); - inputFunctionValues.Print("CrossEntropyWithSoftmax Partial-inputFunctionValues"); - gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right-in"); -#endif - - Matrix::AddScaledDifference(gradientValues, softmaxOfRight, inputFunctionValues, inputGradientValues); -#if DUMPOUTPUT - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right"); -#endif - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - if (inputIndex == 0) - { - descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsInput); - } - else - { - descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsInput); - descriptor->FunctionParam(0, paramOptionsInput); - } - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); - break; - case taskEvaluate: - 
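            // Editorial note (not part of the original file): taskEvaluate computes
            // CE(label, z) = -sum_i label_i * log(softmax_i(z)) via EvaluateThisNodeS.
            // The taskComputeInputPartial case above wires up the standard gradients:
            // dCE/dz = softmax(z) - label (assuming each label column sums to one),
            // realized by Matrix::AddScaledDifference in ComputeInputPartialRight, and
            // dCE/dlabel = -log(softmax(z)), realized by Matrix::ScaleAndAdd with a
            // negated scale in ComputeInputPartialLeft.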
descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsOutput); - descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_softmaxOfRight, m_logSoftmaxOfRight); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - Matrix& softmaxOfRight, Matrix& logSoftmaxOfRight) - { - logSoftmaxOfRight.AssignLogSoftmaxOf(inputFunctionValues1, true); - softmaxOfRight.SetValue(logSoftmaxOfRight); - softmaxOfRight.InplaceExp(); - functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logSoftmaxOfRight); - functionValues*=(-1); -#if NANCHECK - functionValues.HasNan("CrossEntropyWithSoftmax"); -#endif -#if DUMPOUTPUT - functionValues.Print("CrossEntropyWithSoftmaxNode"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 2) - throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires two inputs."); - - if (Inputs(0)->OperationName() != L"InputValue" && Inputs(0)->OperationName() != L"SparseInputValue") - throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); - - //we may release the constraint that the first operant is an inputValue later so the following code should be kept - size_t index = 0; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - index = 1; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? 
Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("CrossEntropyWithSoftmaxNode operation: one of the operants has 0 element."); - - if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) - { - throw std::logic_error("The Matrix dimension in the CrossEntropyWithSoftmaxNode operation does not match."); - } - - FunctionValues().Resize(1,1); - CopyImageSizeFromInputs(); - - m_logSoftmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - m_softmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - //leftNode should be the empirical - virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) - { - m_children.resize(2); - m_children[0] = label; - m_children[1] = prediction; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_logSoftmaxOfRight.GetDeviceId() != deviceId) - { - m_logSoftmaxOfRight.TransferFromDeviceToDevice(m_logSoftmaxOfRight.GetDeviceId(), deviceId,true); - } - if (m_softmaxOfRight.GetDeviceId() != deviceId) - { - m_softmaxOfRight.TransferFromDeviceToDevice(m_softmaxOfRight.GetDeviceId(), deviceId,true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - CrossEntropyWithSoftmaxNode* node = (CrossEntropyWithSoftmaxNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_logSoftmaxOfRight = m_logSoftmaxOfRight; - node->m_softmaxOfRight = m_softmaxOfRight; - } - } - - // copy constructor - CrossEntropyWithSoftmaxNode(const CrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_logSoftmaxOfRight(node->m_deviceId), m_softmaxOfRight(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new CrossEntropyWithSoftmaxNode(this, name, flags); - return node; - } - - protected: - Matrix m_logSoftmaxOfRight; - Matrix m_softmaxOfRight; - }; - - template class CrossEntropyWithSoftmaxNode; - template class CrossEntropyWithSoftmaxNode; - - //calculates: -sum(left_i * log(right_i)) - //assume softmax is already done - template - class CrossEntropyNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - CrossEntropyNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - CrossEntropyNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"CrossEntropy";} - - virtual void ComputeInputPartial(const size_t inputIndex) - { - if (inputIndex > 1) - throw std::invalid_argument("CrossEntropy criterion only takes two inputs."); - - //left Node must be a scalar - if (inputIndex == 0) //left derivative - { - ComputeInputPartialLeft(m_logOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); - } - else - { - ComputeInputPartialRight(m_leftDivRight, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); - } - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropy node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialLeft(const Matrix& logOfRight, Matrix& inputGradientValues, - const Matrix& gradientValues) - { - Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logOfRight, inputGradientValues); - } - - static void WINAPI ComputeInputPartialRight(Matrix& leftDivRight, - const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - Matrix& inputGradientValues, const Matrix& gradientValues) - { - leftDivRight.AssignElementDivisionOf(inputFunctionValues0, inputFunctionValues1); - - Matrix::ScaleAndAdd(-gradientValues.Get00Element(), leftDivRight, inputGradientValues); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - if (inputIndex == 0) - { - descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsInput); - } - else - { - descriptor->MatrixParam(m_leftDivRight, "leftDivRight", paramOptionsInput | paramOptionsTemporary); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - } - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - - virtual void EvaluateThisNode() //-sum(left_i * log(right_i)) - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_logOfRight); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropy node should never 
be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - Matrix& logOfRight) - { - logOfRight.SetValue(inputFunctionValues1); - logOfRight.InplaceLog(); - functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logOfRight); - functionValues*=(-1); -#if NANCHECK - functionValues.HasNan("CrossEntropy"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 2) - throw std::logic_error("CrossEntropyNode criterion requires two inputs."); - - if (Inputs(0)->OperationName() != L"InputValue") - throw std::logic_error("CrossEntropyNode criterion requires the first input to be the label."); - - //we may release the constraint that the first operant is an inputValue later so the following code should be kept - size_t index = 0; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - index = 1; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("CrossEntropyNode operation: one of the operants has 0 element."); - - if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) - { - throw std::logic_error("The Matrix dimension in the CrossEntropyNode operation does not match."); - } - - FunctionValues().Resize(1,1); - m_logOfRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); - m_leftDivRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - //leftNode should be the empirical - virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) - { - m_children.resize(2); - m_children[0] = label; - m_children[1] = prediction; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_logOfRight.GetDeviceId() != deviceId) - { - m_logOfRight.TransferFromDeviceToDevice(m_logOfRight.GetDeviceId(), deviceId,true); - } - if (m_leftDivRight.GetDeviceId() != deviceId) - { - m_leftDivRight.TransferFromDeviceToDevice(m_leftDivRight.GetDeviceId(), deviceId,true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const 
CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - CrossEntropyNode* node = (CrossEntropyNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_logOfRight = m_logOfRight; - node->m_leftDivRight = m_leftDivRight; - } - } - - // copy constructor - CrossEntropyNode(const CrossEntropyNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_logOfRight(node->m_deviceId), m_leftDivRight(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new CrossEntropyNode(this, name, flags); - return node; - } - - private: - // matrix value passed from evaluate to computePartial - Matrix m_logOfRight; - // temporary - Matrix m_leftDivRight; - }; - - template class CrossEntropyNode; - template class CrossEntropyNode; - - template - class MatrixL1RegNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - MatrixL1RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - MatrixL1RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"MatrixL1Reg";} - - virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) - { - if (inputIndex != 0) - throw std::invalid_argument("MatrixL1RegNode only has one input."); - - ComputeInputPartialS(m_gradientOfL1Norm, Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues()); - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL1Reg node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialS(Matrix& gradientOfL1Norm, - Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& inputFunctionValues) - { - gradientOfL1Norm.AssignSignOf(inputFunctionValues); - inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), gradientOfL1Norm); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->MatrixParam(m_gradientOfL1Norm, "gradientOfL1Norm", paramOptionsInput | paramOptionsTemporary); - descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->SetFunction((FARPROC)ComputeInputPartialS); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - 
descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL1Reg node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) - { - functionValues.Resize(1,1); - functionValues.SetValue(inputFunctionValues.MatrixNorm1()); -#if NANCHECK - functionValues.HasNan("MatrixL1Reg"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 1) - throw std::logic_error("MatrixL1Reg criterion should have one input."); - - if (Inputs(0)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("MatrixL1Reg operation: the input node has 0 element."); - - FunctionValues().Resize(1,1); - m_gradientOfL1Norm.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - virtual void AttachInputs(const ComputationNodePtr singleInput) - { - m_children.resize(1); - m_children[0] = singleInput; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_gradientOfL1Norm.GetDeviceId() != deviceId) - m_gradientOfL1Norm.TransferFromDeviceToDevice(m_gradientOfL1Norm.GetDeviceId(), deviceId,true); - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - MatrixL1RegNode* node = (MatrixL1RegNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_gradientOfL1Norm = m_gradientOfL1Norm; - } - } - - // copy constructor - MatrixL1RegNode(const MatrixL1RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_gradientOfL1Norm(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new MatrixL1RegNode(this, name, flags); - return node; - } - - private: - // temporary - Matrix m_gradientOfL1Norm; - }; - - template class MatrixL1RegNode; - template class MatrixL1RegNode; - - template - class MatrixL2RegNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - MatrixL2RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_temp(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - MatrixL2RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_temp(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"MatrixL2Reg";} - - - virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) - { - if (inputIndex != 0) - throw std::invalid_argument("MatrixL2RegNode only has one input."); - - ComputeInputPartialS(Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues(), FunctionValues()); - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL2RegNode node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialS(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& /*inputFunctionValues*/, const Matrix& functionValues) - { - ElemType v = gradientValues.Get00Element() / (functionValues.Get00Element() + EPS_IN_INVERSE); - inputGradientValues.AddWithScaleOf(v, gradientValues); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(); - descriptor->SetFunction((FARPROC)ComputeInputPartialS); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - - virtual void EvaluateThisNode() - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL2RegNode node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) - { - functionValues.Resize(1,1); - functionValues.SetValue(inputFunctionValues.FrobeniusNorm()); -#if NANCHECK - functionValues.HasNan("MatrixL2Reg"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 1) - throw std::logic_error("MatrixL2Reg criterion should have one input."); - - if (Inputs(0)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("MatrixL2Reg operation: the input node has 0 element."); - - FunctionValues().Resize(1,1); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - virtual void AttachInputs(const ComputationNodePtr singleInput) - { - m_children.resize(1); - m_children[0] = singleInput; - } - - // copy constructor - MatrixL2RegNode(const MatrixL2RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_temp(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == 
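// Illustrative sketch (editor's addition, not part of this change-set):
// EvaluateThisNodeS above stores ||X||_F in the 1x1 criterion value, and
// ComputeInputPartialS divides the incoming scalar gradient by that stored
// norm, with EPS_IN_INVERSE guarding the division. The textbook gradient is
// d||X||_F / dX = X / ||X||_F; note that the worker above scales
// gradientValues rather than the input values, so read this as the analytic
// form, not a transcription. Helper name and dense layout are assumptions:
#include <cstddef>

template <class ElemType>
void AddL2Gradient(ElemType* grad, const ElemType* x, std::size_t n,
                   ElemType upstream, ElemType frobNorm, ElemType eps)
{
    ElemType scale = upstream / (frobNorm + eps);  // eps plays the role of EPS_IN_INVERSE
    for (std::size_t i = 0; i < n; i++)
        grad[i] += scale * x[i];                   // X / ||X||_F, scaled by upstream
}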
L"")?NodeName():newName; - - ComputationNodePtr node = new MatrixL2RegNode(this, name, flags); - return node; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_temp.GetDeviceId() != deviceId) - { - m_temp.TransferFromDeviceToDevice(m_temp.GetDeviceId(), deviceId,true); - } - } - } - - private: - Matrix m_temp; - }; - - template class MatrixL2RegNode; - template class MatrixL2RegNode; - - //calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history - template - class ClassBasedCrossEntropyWithSoftmaxNode: public ComputationNode - { - UsingComputationNodeMembers; - public: - ClassBasedCrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmax(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - ClassBasedCrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmax(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"ClassBasedCrossEntropyWithSoftmax";} - - virtual void ComputeInputPartial(const size_t inputIndex) //scaled by 2*number of colmns (samples) in the Matrix - { - if (inputIndex != 1 && inputIndex != 2) - throw std::invalid_argument("ClassCrossEntropyWithSoftmaxNode criterion only takes with respect to input and weight."); - - if (inputIndex == 1) - ComputeClassEntropyGradientOfInput(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); - else - ComputeClassEntropyGradientOfWeight(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); - - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); - } - - static void ComputeClassEntropyGradientOfInput(const Matrix& /*inputFunctionValues0*/, const Matrix& /*inputFunctionValues1*/, - const Matrix& inputFunctionValues2, const Matrix* /*clsInfo*/, const Matrix* /*idx2Cls*/, - const Matrix& logSoftmax, Matrix& grd) - { - logSoftmax.ClassEntropyError(logSoftmax); - logSoftmax.ClassEntropyGradientOfInput(logSoftmax, inputFunctionValues2, grd); - } - - static void ComputeClassEntropyGradientOfWeight(const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - const Matrix& inputFunctionValues2, const Matrix* clsInfo, const Matrix* idx2Cls, - const Matrix& logSoftmax, Matrix& grd) - { - logSoftmax.ClassEntropyGradientOfWeight(logSoftmax, - inputFunctionValues1, inputFunctionValues2, - inputFunctionValues0, - clsInfo, idx2Cls, grd); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new 
TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->FunctionParam(2, paramOptionsInput); - descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsInput); - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->SetFunction(inputIndex==1?(FARPROC)ComputeClassEntropyGradientOfInput:(FARPROC)ComputeClassEntropyGradientOfWeight); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->FunctionParam(2, paramOptionsInput); - descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); - } - - static void EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, - const Matrix& inputFunctionValues1, const Matrix& inputFunctionValues2, - const Matrix* clsInfo, const Matrix* idx2Cls, Matrix& logSoftmax) - { - logSoftmax.Resize(inputFunctionValues0.GetNumRows(), inputFunctionValues0.GetNumCols()); - logSoftmax.ClassEntropy(inputFunctionValues1, inputFunctionValues2, inputFunctionValues0, clsInfo, idx2Cls, logSoftmax, functionValues); -#if NANCHECK - functionValues.HasNan("ClassBasedCrossEntropyWithSoftmax"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 3) - throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs."); - - if (Inputs(0)->OperationName() != L"SparseInputValue" - && Inputs(0)->OperationName() != L"InputValue") - throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); - - if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols() && // label and input same obs numbers - Inputs(0)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumRows() ) ) // label and matrix match output size - { - throw std::logic_error("The Matrix dimension in the ClassBasedCrossEntropyWithSoftmaxNode operation does not match."); - } - - FunctionValues().Resize(1,1); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - 
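// Editor's aside (not part of this change-set): the Validate() above encodes
// the shape contract of the three inputs. With hidden size H, vocabulary size
// V and T observations, the label input(0) is V x T, input(1) is H x T, and
// the weight input(2) is V x H, so weight * input is V x T like the label.
// For example H = 200, V = 10000, T = 128 gives 10000x128, 200x128 and
// 10000x200. A sketch of the same check (hypothetical helper):
#include <cstddef>

inline bool ClassCEShapesOk(std::size_t labelRows, std::size_t labelCols,
                            std::size_t inRows, std::size_t inCols,
                            std::size_t wRows, std::size_t wCols)
{
    return inRows == wCols         // weight and input can be multiplied
        && labelCols == inCols     // same number of observations
        && labelRows == wRows;     // label matches the weight's output size
}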
//leftNode should be the empirical - // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class - // the first row indicates the starting row and the second row indicates the end row of a class - virtual void AddClassInfo(Matrix* classinfo, - Matrix* idx2cls) - { - m_ptrClsinfo = classinfo; - m_ptrIdx2Cls = idx2cls; - } - - //leftNode should be the empirical - // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class - // the first row indicates the starting row and the second row indicates the end row of a class - virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr input, - const ComputationNodePtr matrix) - { - m_children.resize(3); - m_children[0] = label; - m_children[1] = input; - m_children[2] = matrix; - - //initializes m_logSoftmax - m_logSoftmax.SwitchToMatrixType(SPARSE, matrixFormatSparseCSC); - m_logSoftmax.Resize(label->FunctionValues().GetNumRows(), label->FunctionValues().GetNumCols()); - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_logSoftmax.GetDeviceId() != deviceId) - { - m_logSoftmax.TransferFromDeviceToDevice(m_logSoftmax.GetDeviceId(), deviceId,true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - ClassBasedCrossEntropyWithSoftmaxNode* node = (ClassBasedCrossEntropyWithSoftmaxNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_logSoftmax = m_logSoftmax; - } - } - - // copy constructor - ClassBasedCrossEntropyWithSoftmaxNode(const ClassBasedCrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_logSoftmax(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new ClassBasedCrossEntropyWithSoftmaxNode(this, name, flags); - return node; - } - - protected: - Matrix m_logSoftmax; - - Matrix* m_ptrClsinfo; - Matrix* m_ptrIdx2Cls; - }; - - template class ClassBasedCrossEntropyWithSoftmaxNode; - template class ClassBasedCrossEntropyWithSoftmaxNode; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#pragma once + +#include +#include +#include +#include +#include +#include +#include "ComputationNode.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + //note: to save computation the gradient may be scaled by an constant. + + template + class SquareErrorNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + SquareErrorNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_leftMinusRight(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + SquareErrorNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_leftMinusRight(deviceId) + { + m_nodeName = (name == L""? 
CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"SquareError";} + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex > 1) + throw std::invalid_argument("SquareError criteria only takes two inputs."); + + //left Node must be a scalar + if (inputIndex == 0) //left derivative + { + ComputeInputPartialLeft(Inputs(0)->GradientValues(), GradientValues(), m_leftMinusRight); + } + else + { + ComputeInputPartialRight(Inputs(1)->GradientValues(), GradientValues(), m_leftMinusRight); + } + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("SquareError node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialLeft(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) + { + inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), leftMinusRight); + } + + static void WINAPI ComputeInputPartialRight(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) + { + inputGradientValues.AddWithScaleOf(-gradientValues.Get00Element(), leftMinusRight); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsInput); + descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_leftMinusRight); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("SquareError node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, Matrix& leftMinusRight) + { + leftMinusRight.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1); + ElemType v = leftMinusRight.FrobeniusNorm(); + functionValues.Resize(1,1); + functionValues.SetValue(v*v/2); +#if NANCHECK + functionValues.HasNan("SquareError"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 2) + throw std::logic_error("SquareError operation requires two inputs."); + + size_t index = 0; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? 
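// Illustrative sketch (editor's addition, not part of this change-set):
// EvaluateThisNodeS above caches d = a - b and sets the criterion to
// v*v/2 = 0.5 * ||a - b||_F^2, so dE/da = d and dE/db = -d, each scaled by the
// scalar gradient g = GradientValues().Get00Element() - exactly what
// ComputeInputPartialLeft/Right add. Dense layout and helper name are assumed:
#include <cstddef>

template <class ElemType>
void AddSquareErrorGradients(ElemType* gradA, ElemType* gradB,
                             const ElemType* a, const ElemType* b,
                             std::size_t n, ElemType g)
{
    for (std::size_t i = 0; i < n; i++)
    {
        ElemType d = a[i] - b[i];
        gradA[i] += g * d;   // mirrors ComputeInputPartialLeft
        gradB[i] -= g * d;   // mirrors ComputeInputPartialRight
    }
}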
Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + index = 1; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("SquareError operation: one of the operants has 0 element."); + + if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) + { + throw std::logic_error("The Matrix dimension in the SquareError operation does not match."); + } + + FunctionValues().Resize(1,1); + m_leftMinusRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode) + { + m_children.resize(2); + m_children[0] = leftNode; + m_children[1] = rightNode; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_leftMinusRight.GetDeviceId() != deviceId) + m_leftMinusRight.TransferFromDeviceToDevice(m_leftMinusRight.GetDeviceId(), deviceId,true); + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + SquareErrorNode* node = (SquareErrorNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_leftMinusRight = m_leftMinusRight; + } + } + + // copy constructor + SquareErrorNode(const SquareErrorNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_leftMinusRight(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new SquareErrorNode(this, name, flags); + return node; + } + + private: + Matrix m_leftMinusRight; + }; + + template class SquareErrorNode; + template class SquareErrorNode; + + //calculates: -sum(left_i * log(softmax_i(right))) + template + class CrossEntropyWithSoftmaxNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + CrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) + { + m_nodeName = (name == L""? 
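// Editor's aside (not part of this change-set): the two LearnableParameter
// branches in the Validate() above apply a dimension-borrowing rule: a
// dimension that is still 0 (not yet fixed) is inherited from the peer
// operand, so a 0x0 parameter validated against a 10x32 label is resized to
// 10x32. The rule in isolation:
#include <cstddef>

inline std::size_t BorrowDim(std::size_t mine, std::size_t peers)
{
    return mine == 0 ? peers : mine;   // keep own size when set, else inherit
}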
CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + CrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"CrossEntropyWithSoftmax";} + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex > 1) + throw std::invalid_argument("CrossEntropyWithSoftmaxNode criterion only takes two inputs."); + + //left Node must be a scalar + if (inputIndex == 0) //left derivative + { + ComputeInputPartialLeft(m_logSoftmaxOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); + } + else + { + ComputeInputPartialRight(m_softmaxOfRight, Inputs(0)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); + } + + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialLeft(const Matrix& logSoftmaxOfRight, Matrix& inputGradientValues, + const Matrix& gradientValues) + { +#if DUMPOUTPUT + logSoftmaxOfRight.Print("CrossEntropyWithSoftmax Partial-logSoftmaxOfRight"); + gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-in"); +#endif + + Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues); +#if DUMPOUTPUT + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-out"); +#endif + + } + + static void WINAPI ComputeInputPartialRight(const Matrix& softmaxOfRight, const Matrix& inputFunctionValues, + Matrix& inputGradientValues, const Matrix& gradientValues) + { +#if DUMPOUTPUT + softmaxOfRight.Print("CrossEntropyWithSoftmax Partial-softmaxOfRight"); + inputFunctionValues.Print("CrossEntropyWithSoftmax Partial-inputFunctionValues"); + gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right-in"); +#endif + + Matrix::AddScaledDifference(gradientValues, softmaxOfRight, inputFunctionValues, inputGradientValues); +#if DUMPOUTPUT + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right"); +#endif + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + if (inputIndex == 0) + { + descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsInput); + } + else + { + descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsInput); + descriptor->FunctionParam(0, paramOptionsInput); + } + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); + break; + case taskEvaluate: + 
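// Editor's aside (not part of this change-set): why this node fuses softmax
// and cross entropy. With p = softmax(z), labels y, and the scalar gradient g
// reaching this 1x1 criterion, L = -sum_i y_i * log p_i gives
//     dL/dy = -g * log p          (ComputeInputPartialLeft above), and
//     dL/dz =  g * (p - y)        (ComputeInputPartialRight above),
// i.e. the softmax Jacobian cancels the 1/p_i factor of a plain cross entropy,
// which is both cheaper and numerically safer than chaining separate nodes.
// Quick check with z = (0, 0), y = (1, 0), g = 1: p = (0.5, 0.5) and
// dL/dz = (-0.5, 0.5).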
descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsOutput); + descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_softmaxOfRight, m_logSoftmaxOfRight); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + Matrix& softmaxOfRight, Matrix& logSoftmaxOfRight) + { + logSoftmaxOfRight.AssignLogSoftmaxOf(inputFunctionValues1, true); + softmaxOfRight.SetValue(logSoftmaxOfRight); + softmaxOfRight.InplaceExp(); + functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logSoftmaxOfRight); + functionValues*=(-1); +#if NANCHECK + functionValues.HasNan("CrossEntropyWithSoftmax"); +#endif +#if DUMPOUTPUT + functionValues.Print("CrossEntropyWithSoftmaxNode"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 2) + throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires two inputs."); + + if (Inputs(0)->OperationName() != L"InputValue" && Inputs(0)->OperationName() != L"SparseInputValue") + throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); + + //we may release the constraint that the first operant is an inputValue later so the following code should be kept + size_t index = 0; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + index = 1; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? 
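// Illustrative sketch (editor's addition, not part of this change-set):
// EvaluateThisNodeS above computes the log-softmax first and then derives the
// softmax as exp(log-softmax), so both matrices come from one stable pass.
// A per-column sketch of what AssignLogSoftmaxOf(..., true) presumably
// computes (the bool is taken to mean column-wise):
#include <cmath>
#include <cstddef>

// logsoftmax(z)_i = z_i - (m + log(sum_j exp(z_j - m))), with m = max_j z_j
template <class ElemType>
void LogSoftmaxColumn(ElemType* out, const ElemType* z, std::size_t n)
{
    ElemType m = z[0];
    for (std::size_t i = 1; i < n; i++)
        if (z[i] > m) m = z[i];
    ElemType sum = ElemType(0);
    for (std::size_t i = 0; i < n; i++)
        sum += std::exp(z[i] - m);                 // safe: exponents are <= 0
    ElemType logSumExp = m + std::log(sum);
    for (std::size_t i = 0; i < n; i++)
        out[i] = z[i] - logSumExp;
}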
Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("CrossEntropyWithSoftmaxNode operation: one of the operants has 0 element."); + + if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) + { + throw std::logic_error("The Matrix dimension in the CrossEntropyWithSoftmaxNode operation does not match."); + } + + FunctionValues().Resize(1,1); + CopyImageSizeFromInputs(); + + m_logSoftmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + m_softmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + //leftNode should be the empirical + virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) + { + m_children.resize(2); + m_children[0] = label; + m_children[1] = prediction; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_logSoftmaxOfRight.GetDeviceId() != deviceId) + { + m_logSoftmaxOfRight.TransferFromDeviceToDevice(m_logSoftmaxOfRight.GetDeviceId(), deviceId,true); + } + if (m_softmaxOfRight.GetDeviceId() != deviceId) + { + m_softmaxOfRight.TransferFromDeviceToDevice(m_softmaxOfRight.GetDeviceId(), deviceId,true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + CrossEntropyWithSoftmaxNode* node = (CrossEntropyWithSoftmaxNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_logSoftmaxOfRight = m_logSoftmaxOfRight; + node->m_softmaxOfRight = m_softmaxOfRight; + } + } + + // copy constructor + CrossEntropyWithSoftmaxNode(const CrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_logSoftmaxOfRight(node->m_deviceId), m_softmaxOfRight(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new CrossEntropyWithSoftmaxNode(this, name, flags); + return node; + } + + protected: + Matrix m_logSoftmaxOfRight; + Matrix m_softmaxOfRight; + }; + + template class CrossEntropyWithSoftmaxNode; + template class CrossEntropyWithSoftmaxNode; + + //calculates: -sum(left_i * log(right_i)) + //assume softmax is already done + template + class CrossEntropyNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + CrossEntropyNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) + { + m_nodeName = (name == L""? 
CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + CrossEntropyNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"CrossEntropy";} + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex > 1) + throw std::invalid_argument("CrossEntropy criterion only takes two inputs."); + + //left Node must be a scalar + if (inputIndex == 0) //left derivative + { + ComputeInputPartialLeft(m_logOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); + } + else + { + ComputeInputPartialRight(m_leftDivRight, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); + } + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropy node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialLeft(const Matrix& logOfRight, Matrix& inputGradientValues, + const Matrix& gradientValues) + { + Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logOfRight, inputGradientValues); + } + + static void WINAPI ComputeInputPartialRight(Matrix& leftDivRight, + const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + Matrix& inputGradientValues, const Matrix& gradientValues) + { + leftDivRight.AssignElementDivisionOf(inputFunctionValues0, inputFunctionValues1); + + Matrix::ScaleAndAdd(-gradientValues.Get00Element(), leftDivRight, inputGradientValues); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + if (inputIndex == 0) + { + descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsInput); + } + else + { + descriptor->MatrixParam(m_leftDivRight, "leftDivRight", paramOptionsInput | paramOptionsTemporary); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + } + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + + virtual void EvaluateThisNode() //-sum(left_i * log(right_i)) + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_logOfRight); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropy node should never 
be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + Matrix& logOfRight) + { + logOfRight.SetValue(inputFunctionValues1); + logOfRight.InplaceLog(); + functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logOfRight); + functionValues*=(-1); +#if NANCHECK + functionValues.HasNan("CrossEntropy"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 2) + throw std::logic_error("CrossEntropyNode criterion requires two inputs."); + + if (Inputs(0)->OperationName() != L"InputValue") + throw std::logic_error("CrossEntropyNode criterion requires the first input to be the label."); + + //we may release the constraint that the first operant is an inputValue later so the following code should be kept + size_t index = 0; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + index = 1; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("CrossEntropyNode operation: one of the operants has 0 element."); + + if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) + { + throw std::logic_error("The Matrix dimension in the CrossEntropyNode operation does not match."); + } + + FunctionValues().Resize(1,1); + m_logOfRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); + m_leftDivRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + //leftNode should be the empirical + virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) + { + m_children.resize(2); + m_children[0] = label; + m_children[1] = prediction; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_logOfRight.GetDeviceId() != deviceId) + { + m_logOfRight.TransferFromDeviceToDevice(m_logOfRight.GetDeviceId(), deviceId,true); + } + if (m_leftDivRight.GetDeviceId() != deviceId) + { + m_leftDivRight.TransferFromDeviceToDevice(m_leftDivRight.GetDeviceId(), deviceId,true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const 
CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + CrossEntropyNode* node = (CrossEntropyNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_logOfRight = m_logOfRight; + node->m_leftDivRight = m_leftDivRight; + } + } + + // copy constructor + CrossEntropyNode(const CrossEntropyNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_logOfRight(node->m_deviceId), m_leftDivRight(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new CrossEntropyNode(this, name, flags); + return node; + } + + private: + // matrix value passed from evaluate to computePartial + Matrix m_logOfRight; + // temporary + Matrix m_leftDivRight; + }; + + template class CrossEntropyNode; + template class CrossEntropyNode; + + template + class MatrixL1RegNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + MatrixL1RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + MatrixL1RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"MatrixL1Reg";} + + virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) + { + if (inputIndex != 0) + throw std::invalid_argument("MatrixL1RegNode only has one input."); + + ComputeInputPartialS(m_gradientOfL1Norm, Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues()); + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL1Reg node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialS(Matrix& gradientOfL1Norm, + Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& inputFunctionValues) + { + gradientOfL1Norm.AssignSignOf(inputFunctionValues); + inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), gradientOfL1Norm); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->MatrixParam(m_gradientOfL1Norm, "gradientOfL1Norm", paramOptionsInput | paramOptionsTemporary); + descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->SetFunction((FARPROC)ComputeInputPartialS); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + 
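// Editor's aside (not part of this change-set): in these GetPTaskDescriptor
// bodies, the registration order mirrors the static worker's parameter list.
// For ComputeInputPartialS above:
//     MatrixParam(m_gradientOfL1Norm, ...) -> gradientOfL1Norm (scratch,
//                                             hence paramOptionsTemporary)
//     GradientParam(0, in|out|initialize)  -> inputGradientValues (accumulated)
//     GradientParam()                      -> this node's own gradient values
//     FunctionParam(0, paramOptionsInput)  -> inputFunctionValues
// SetFunction then wires in the worker itself via the FARPROC cast.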
descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL1Reg node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) + { + functionValues.Resize(1,1); + functionValues.SetValue(inputFunctionValues.MatrixNorm1()); +#if NANCHECK + functionValues.HasNan("MatrixL1Reg"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 1) + throw std::logic_error("MatrixL1Reg criterion should have one input."); + + if (Inputs(0)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("MatrixL1Reg operation: the input node has 0 element."); + + FunctionValues().Resize(1,1); + m_gradientOfL1Norm.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + virtual void AttachInputs(const ComputationNodePtr singleInput) + { + m_children.resize(1); + m_children[0] = singleInput; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_gradientOfL1Norm.GetDeviceId() != deviceId) + m_gradientOfL1Norm.TransferFromDeviceToDevice(m_gradientOfL1Norm.GetDeviceId(), deviceId,true); + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + MatrixL1RegNode* node = (MatrixL1RegNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_gradientOfL1Norm = m_gradientOfL1Norm; + } + } + + // copy constructor + MatrixL1RegNode(const MatrixL1RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_gradientOfL1Norm(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new MatrixL1RegNode(this, name, flags); + return node; + } + + private: + // temporary + Matrix m_gradientOfL1Norm; + }; + + template class MatrixL1RegNode; + template class MatrixL1RegNode; + + template + class MatrixL2RegNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + MatrixL2RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_temp(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + MatrixL2RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_temp(deviceId) + { + m_nodeName = (name == L""? 
CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"MatrixL2Reg";} + + + virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) + { + if (inputIndex != 0) + throw std::invalid_argument("MatrixL2RegNode only has one input."); + + ComputeInputPartialS(Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues(), FunctionValues()); + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL2RegNode node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialS(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& /*inputFunctionValues*/, const Matrix& functionValues) + { + ElemType v = gradientValues.Get00Element() / (functionValues.Get00Element() + EPS_IN_INVERSE); + inputGradientValues.AddWithScaleOf(v, gradientValues); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(); + descriptor->SetFunction((FARPROC)ComputeInputPartialS); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL2RegNode node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) + { + functionValues.Resize(1,1); + functionValues.SetValue(inputFunctionValues.FrobeniusNorm()); +#if NANCHECK + functionValues.HasNan("MatrixL2Reg"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 1) + throw std::logic_error("MatrixL2Reg criterion should have one input."); + + if (Inputs(0)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("MatrixL2Reg operation: the input node has 0 element."); + + FunctionValues().Resize(1,1); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + virtual void AttachInputs(const ComputationNodePtr singleInput) + { + m_children.resize(1); + m_children[0] = singleInput; + } + + // copy constructor + MatrixL2RegNode(const MatrixL2RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_temp(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == 
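// Editor's aside (not part of this change-set): cloning follows the same
// three-step pattern throughout this header. Duplicate() picks the name and
// allocates via the flags-aware copy constructor, which delegates to CopyTo();
// passing CopyNodeFlags::copyNodeValue also copies the numeric buffers, not
// just the node's wiring. Illustrative call (names hypothetical):
//     ComputationNodePtr clone = node->Duplicate(L"l2#copy", CopyNodeFlags::copyNodeValue);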
L"")?NodeName():newName; + + ComputationNodePtr node = new MatrixL2RegNode(this, name, flags); + return node; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_temp.GetDeviceId() != deviceId) + { + m_temp.TransferFromDeviceToDevice(m_temp.GetDeviceId(), deviceId,true); + } + } + } + + private: + Matrix m_temp; + }; + + template class MatrixL2RegNode; + template class MatrixL2RegNode; + + //calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history + template + class ClassBasedCrossEntropyWithSoftmaxNode: public ComputationNode + { + UsingComputationNodeMembers; + public: + ClassBasedCrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmax(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + ClassBasedCrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmax(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"ClassBasedCrossEntropyWithSoftmax";} + + virtual void ComputeInputPartial(const size_t inputIndex) //scaled by 2*number of colmns (samples) in the Matrix + { + if (inputIndex != 1 && inputIndex != 2) + throw std::invalid_argument("ClassCrossEntropyWithSoftmaxNode criterion only takes with respect to input and weight."); + + if (inputIndex == 1) + ComputeClassEntropyGradientOfInput(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); + else + ComputeClassEntropyGradientOfWeight(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); + + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); + } + + static void ComputeClassEntropyGradientOfInput(const Matrix& /*inputFunctionValues0*/, const Matrix& /*inputFunctionValues1*/, + const Matrix& inputFunctionValues2, const Matrix* /*clsInfo*/, const Matrix* /*idx2Cls*/, + const Matrix& logSoftmax, Matrix& grd) + { + logSoftmax.ClassEntropyError(logSoftmax); + logSoftmax.ClassEntropyGradientOfInput(logSoftmax, inputFunctionValues2, grd); + } + + static void ComputeClassEntropyGradientOfWeight(const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + const Matrix& inputFunctionValues2, const Matrix* clsInfo, const Matrix* idx2Cls, + const Matrix& logSoftmax, Matrix& grd) + { + logSoftmax.ClassEntropyGradientOfWeight(logSoftmax, + inputFunctionValues1, inputFunctionValues2, + inputFunctionValues0, + clsInfo, idx2Cls, grd); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new 
TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->FunctionParam(2, paramOptionsInput); + descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsInput); + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->SetFunction(inputIndex==1?(FARPROC)ComputeClassEntropyGradientOfInput:(FARPROC)ComputeClassEntropyGradientOfWeight); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->FunctionParam(2, paramOptionsInput); + descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); + } + + static void EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, + const Matrix& inputFunctionValues1, const Matrix& inputFunctionValues2, + const Matrix* clsInfo, const Matrix* idx2Cls, Matrix& logSoftmax) + { + logSoftmax.Resize(inputFunctionValues0.GetNumRows(), inputFunctionValues0.GetNumCols()); + logSoftmax.ClassEntropy(inputFunctionValues1, inputFunctionValues2, inputFunctionValues0, clsInfo, idx2Cls, logSoftmax, functionValues); +#if NANCHECK + functionValues.HasNan("ClassBasedCrossEntropyWithSoftmax"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 3) + throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs."); + + if (Inputs(0)->OperationName() != SparseInputValue::TypeName() + && Inputs(0)->OperationName() != InputValue::TypeName()) + throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); + + if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols() && // label and input same obs numbers + Inputs(0)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumRows() ) ) // label and matrix match output size + { + throw std::logic_error("The Matrix dimension in the ClassBasedCrossEntropyWithSoftmaxNode operation does not match."); + } + + FunctionValues().Resize(1,1); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; 
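// Editor's aside (not part of this change-set): a worked example of the class
// layout documented just below. With a 6-word vocabulary split into 2
// contiguous classes (words 0-2 and 3-5), and assuming the "end row" is
// exclusive (the comment below does not pin down inclusivity):
//     classinfo (2 x 2):  row 0 = { 0, 3 }       // starting word row per class
//                         row 1 = { 3, 6 }       // end word row per class
//     idx2cls   (6 x 1):  { 0, 0, 0, 1, 1, 1 }   // word row -> class id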
+ } + + //leftNode should be the empirical + // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class + // the first row indicates the starting row and the second row indicates the end row of a class + virtual void AddClassInfo(Matrix* classinfo, + Matrix* idx2cls) + { + m_ptrClsinfo = classinfo; + m_ptrIdx2Cls = idx2cls; + } + + //leftNode should be the empirical + // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class + // the first row indicates the starting row and the second row indicates the end row of a class + virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr input, + const ComputationNodePtr matrix) + { + m_children.resize(3); + m_children[0] = label; + m_children[1] = input; + m_children[2] = matrix; + + //initializes m_logSoftmax + m_logSoftmax.SwitchToMatrixType(SPARSE, matrixFormatSparseCSC); + m_logSoftmax.Resize(label->FunctionValues().GetNumRows(), label->FunctionValues().GetNumCols()); + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_logSoftmax.GetDeviceId() != deviceId) + { + m_logSoftmax.TransferFromDeviceToDevice(m_logSoftmax.GetDeviceId(), deviceId,true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + ClassBasedCrossEntropyWithSoftmaxNode* node = (ClassBasedCrossEntropyWithSoftmaxNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_logSoftmax = m_logSoftmax; + } + } + + // copy constructor + ClassBasedCrossEntropyWithSoftmaxNode(const ClassBasedCrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_logSoftmax(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new ClassBasedCrossEntropyWithSoftmaxNode(this, name, flags); + return node; + } + + protected: + Matrix m_logSoftmax; + + Matrix* m_ptrClsinfo; + Matrix* m_ptrIdx2Cls; + }; + + template class ClassBasedCrossEntropyWithSoftmaxNode; + template class ClassBasedCrossEntropyWithSoftmaxNode; + }}} \ No newline at end of file diff --git a/MachineLearning/cn/cn.cpp b/MachineLearning/cn/cn.cpp index d40e369c6..40584fa83 100644 --- a/MachineLearning/cn/cn.cpp +++ b/MachineLearning/cn/cn.cpp @@ -1,738 +1,771 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// cn.cpp : Defines the entry point for the console application. 
-// - -#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ - -#include "stdafx.h" -#include "ComputationNetwork.h" -#include "ComputationNode.h" -#include "DataReader.h" -#include "DataWriter.h" -#include "SimpleNetworkBuilder.h" -#include "NDLNetworkBuilder.h" -#include "SynchronousExecutionEngine.h" -#include "ModelEditLanguage.h" -#include "SGD.h" -#include -#include "commandArgUtil.h" -#include "SimpleEvaluator.h" -#include "SimpleOutputWriter.h" -#include -#include -#if defined(_WIN32) -#include "io.h" -#endif -#include "hostname.h" -#ifdef LEAKDETECT -#include "vld.h" // for memory leak detection -#endif -#include -#include "BestGpu.h" - -// MPI builds on windows require the following installed to "c:\program files\Microsoft MPI\" -// HPC Pack 2012 R2 MS-MPI Redistributable Package -// http://www.microsoft.com/en-us/download/details.aspx?id=41634 - -#ifdef MPI_SUPPORT -#include "mpi.h" -#pragma comment(lib, "msmpi.lib") -#endif -int numProcs; -int myRank; - -using namespace std; -using namespace Microsoft::MSR::CNTK; - -// internal test routine forward declaration -template -void TestCn(const ConfigParameters& config); - -void RedirectStdErr(wstring logpath) -{ - fprintf (stderr, "Redirecting stderr to file %S\n", logpath.c_str()); - msra::files::make_intermediate_dirs (logpath); - auto_file_ptr f (logpath.c_str(), "wb"); - if (dup2 (fileno (f), 2) == -1) - RuntimeError ("unexpected failure to redirect stderr to log file"); - setvbuf (stderr, NULL, _IONBF, 16384); // unbuffer it -} - -std::string WCharToString(const wchar_t* wst) -{ - std::wstring ws(wst); - std::string s(ws.begin(), ws.end()); - s.assign(ws.begin(), ws.end()); - return s; -} - -template -void DumpNodeInfo(const ConfigParameters& config) -{ - wstring modelPath = config("modelPath"); - wstring nodeName = config("nodeName",L"__AllNodes__"); - wstring defOutFilePath = modelPath + L"." 
+ nodeName + L".txt"; - wstring outputFile = config("outputFile", WCharToString(defOutFilePath.c_str()).c_str()); - bool printValues = config("printValues", "true"); - - ComputationNetwork net(-1); //always use CPU - net.LoadFromFile(modelPath); - net.DumpNodeInfoToFile(nodeName, printValues, outputFile); -} - -template -void DoEvalBase(const ConfigParameters& config, IDataReader& reader) -{ - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "40960"); - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - - int traceLevel = config("traceLevel", "0"); - size_t numMBsToShowResult = config("numMBsToShowResult", "100"); - - ConfigArray evalNodeNames = config("evalNodeNames",""); - vector evalNodeNamesVector; - for (int i=0; i < evalNodeNames.size(); ++i) - { - evalNodeNamesVector.push_back(evalNodeNames[i]); - } - - ComputationNetwork net(deviceId); - net.LoadFromFile(modelPath); - net.ResetEvalTimeStamp(); - - SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); - eval.Evaluate(reader, evalNodeNamesVector, mbSize[0], epochSize); -} - -template -void DoEval(const ConfigParameters& config) -{ - //test - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DataReader testDataReader(readerConfig); - - DoEvalBase(config, testDataReader); -} - -template -void DoEvalUnroll(const ConfigParameters& config) -{ - //test - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DataReader testDataReader(readerConfig); - - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "40960"); - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - wstring path2EvalResults = config("path2EvalResults", L""); - - ComputationNetwork net(deviceId); - net.LoadFromFile(modelPath); - net.ResetEvalTimeStamp(); - - SimpleEvaluator eval(net); - ElemType evalEntropy; - eval.EvaluateUnroll(testDataReader, mbSize[0], evalEntropy, path2EvalResults == L""? 
nullptr : path2EvalResults.c_str(), epochSize); -} - -template -void DoCrossValidate(const ConfigParameters& config) -{ - //test - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "40960"); - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - - ConfigArray cvIntervalConfig = config("crossValidationInterval"); - intargvector cvInterval = cvIntervalConfig; - - size_t sleepSecondsBetweenRuns = config("sleepTimeBetweenRuns", "0"); - - int traceLevel = config("traceLevel", "0"); - size_t numMBsToShowResult = config("numMBsToShowResult", "100"); - - ConfigArray evalNodeNames = config("evalNodeNames",""); - vector evalNodeNamesVector; - for (int i=0; i < evalNodeNames.size(); ++i) - { - evalNodeNamesVector.push_back(evalNodeNames[i]); - } - - std::vector> cvErrorResults; - std::vector cvModels; - - DataReader cvDataReader(readerConfig); - - bool finalModelEvaluated = false; - for (size_t i=cvInterval[0]; i<=cvInterval[2]; i+=cvInterval[1]) - { - wstring cvModelPath = msra::strfun::wstrprintf (L"%ls.%lld", modelPath.c_str(), i); - - if (!fexists (cvModelPath)) - { - fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str()); - if (finalModelEvaluated || !fexists (modelPath)) - continue; // file missing - else - { - cvModelPath = modelPath; - finalModelEvaluated = true; - } - } - - cvModels.push_back(cvModelPath); - ComputationNetwork net(deviceId); - net.LoadFromFile(cvModelPath); - net.ResetEvalTimeStamp(); - - SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); - - fprintf(stderr, "model %ls --> \n",cvModelPath.c_str()); - std::vector evalErrors; - evalErrors = eval.Evaluate(cvDataReader, evalNodeNamesVector, mbSize[0], epochSize); - cvErrorResults.push_back(evalErrors); - - ::Sleep(1000*sleepSecondsBetweenRuns); - } - - //find best model - if (cvErrorResults.size() == 0) - throw std::logic_error("No model is evaluated."); - - std::vector minErrors; - std::vector minErrIds; - std::vector evalErrors = cvErrorResults[0]; - for (int i=0; i < evalErrors.size(); ++i) - { - minErrors.push_back(evalErrors[i]); - minErrIds.push_back(0); - } - - for (int i=0; i -void DoWriteOutput(const ConfigParameters& config) -{ - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - readerConfig.Insert("randomize","None"); //we don't want randomization when output results - - DataReader testDataReader(readerConfig); - - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "2048"); - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - - ConfigArray outputNodeNames = config("outputNodeNames",""); - vector outputNodeNamesVector; - for (int i=0; i < outputNodeNames.size(); ++i) - { - outputNodeNamesVector.push_back(outputNodeNames[i]); - } - - ComputationNetwork net(deviceId); - net.LoadFromFile(modelPath); - net.ResetEvalTimeStamp(); - - SimpleOutputWriter writer(net, 1); - - if (config.Exists("writer")) - { - ConfigParameters writerConfig (config("writer")); - bool bWriterUnittest = writerConfig("unittest","false"); - DataWriter 
testDataWriter(writerConfig); - writer.WriteOutput(testDataReader,mbSize[0], testDataWriter, outputNodeNamesVector, epochSize, bWriterUnittest); - } - else if (config.Exists("outputPath")) - { - wstring outputPath = config("outputPath"); // crashes if no default given? - writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, epochSize); - } - //writer.WriteOutput(testDataReader, mbSize[0], testDataWriter, outputNodeNamesVector, epochSize); -} - -namespace Microsoft { namespace MSR { namespace CNTK { - -TrainingCriterion ParseTrainingCriterionString(wstring s) -{ - msra::strfun::tolower_ascii(s); - if (s==L"crossentropywithsoftmax") - return TrainingCriterion::CrossEntropyWithSoftmax; - else if (s==L"squareerror") - return TrainingCriterion::SquareError; - else if (s!=L"classcrossentropywithsoftmax") // (twisted logic to keep compiler happy w.r.t. not returning from LogicError) - LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (CrossEntropyWithSoftmax | SquareError | ClassCrossEntropyWithSoftmax)"); - return TrainingCriterion::ClassCrossEntropyWithSoftmax; -} - -EvalCriterion ParseEvalCriterionString(wstring s) -{ - msra::strfun::tolower_ascii(s); - if (s==L"errorprediction") - return EvalCriterion::ErrorPrediction; - else if (s==L"crossentropywithsoftmax") - return EvalCriterion::CrossEntropyWithSoftmax; - else if (s==L"classcrossentropywithsoftmax") - return EvalCriterion::ClassCrossEntropyWithSoftmax; - else if (s!=L"squareerror") - LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (ErrorPrediction | CrossEntropyWithSoftmax | SquareError)"); - return EvalCriterion::SquareError; -} - -}}}; - -template -void DoCreateLabelMap(const ConfigParameters& config) -{ - // this gets the section name we are interested in - std::string section = config("section"); - // get that section (probably a peer config section, which works thanks to heirarchal symbol resolution) - ConfigParameters configSection (config(section)); - ConfigParameters readerConfig (configSection("reader")); - readerConfig.Insert("allowMapCreation","true"); - DEVICEID_TYPE deviceId = CPUDEVICE; - size_t minibatchSize = config("minibatchSize", "2048"); - int traceLevel = config("traceLevel","0"); - std::vector featureNames; - std::vector labelNames; - GetFileConfigNames(readerConfig, featureNames, labelNames); - - // setup minibatch matrices - Matrix featuresMatrix(deviceId); - Matrix labelsMatrix(deviceId); - std::map*> matrices; - matrices[featureNames[0]] = &featuresMatrix; - if (labelNames.size() == 0) - RuntimeError("CreateLabelMap: no labels found to process"); - - // now create the reader and loop through the entire dataset to get all the labels - auto start = std::chrono::system_clock::now(); - for (const std::wstring& labelsName: labelNames) - { - // take the last label file defined (the other one might be input) - matrices[labelsName] = &labelsMatrix; - - // get the label mapping file name - ConfigParameters labelConfig (readerConfig(labelsName)); - std::string labelMappingFile; - if (labelConfig.ExistsCurrent("labelMappingFile")) - labelMappingFile = labelConfig("labelMappingFile"); - else if (readerConfig.ExistsCurrent("labelMappingFile")) - labelMappingFile = labelConfig("labelMappingFile"); - else - RuntimeError("CreateLabelMap: No labelMappingFile defined"); - - if (fexists(labelMappingFile)) - { - fprintf(stderr,"CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str()); - return; - } 
- fprintf(stderr,"CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str()); - - DataReader dataReader(readerConfig); - - dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize); - int count = 0; - while (dataReader.GetMinibatch(matrices)) - { - Matrix& features = *matrices[featureNames[0]]; - count += features.GetNumCols(); - if (traceLevel > 1) - fprintf(stderr,"."); // progress meter - } - dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize); - - // print the results - if (traceLevel > 0) - fprintf(stderr,"\nread %d labels and produced %s\n", count, labelMappingFile.c_str()); - } - auto end = std::chrono::system_clock::now(); - auto elapsed = end-start; - if (traceLevel > 1) - fprintf(stderr, "%f seconds elapsed\n", (float)(std::chrono::duration_cast(elapsed).count())/1000); -} - - -template -void DoTrain(const ConfigParameters& config) -{ - ConfigParameters configSGD (config("SGD")); - bool makeMode = config("makeMode", "true"); - - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - IComputationNetBuilder* netBuilder = NULL; - - if (config.Exists("NDLNetworkBuilder")) - { - ConfigParameters configNDL (config("NDLNetworkBuilder")); - netBuilder = (IComputationNetBuilder*)new NDLBuilder(configNDL); - } - else if (config.Exists("SimpleNetworkBuilder")) - { - ConfigParameters configSNB (config("SimpleNetworkBuilder")); - netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(configSNB); - } - else - { - RuntimeError("No network builder found in the config file. NDLNetworkBuilder or SimpleNetworkBuilde must be specified" ); - } - - DataReader* dataReader = new DataReader(readerConfig); - - DataReader* cvDataReader = nullptr; - ConfigParameters cvReaderConfig (config("cvReader", L"")); - - if (cvReaderConfig.size() != 0) - { - cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); - cvDataReader = new DataReader(cvReaderConfig); - } - - SGD sgd(configSGD); - - sgd.Train(netBuilder, dataReader, cvDataReader, makeMode); - - delete netBuilder; - delete dataReader; - delete cvDataReader; -} - -template -void DoAdapt(const ConfigParameters& config) -{ - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - - ConfigParameters configSGD (config("SGD")); - bool makeMode = config("makeMode", "true"); - - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DataReader* dataReader = new DataReader(readerConfig); - - DataReader* cvDataReader = nullptr; - ConfigParameters cvReaderConfig (config("cvReader", L"")); - - if (cvReaderConfig.size() != 0) - { - cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); - cvDataReader = new DataReader(cvReaderConfig); - } - - wstring origModelFileName = config("origModelFileName", L""); - wstring refNodeName = config("refNodeName", L""); - - SGD sgd(configSGD); - - sgd.Adapt(origModelFileName, refNodeName, dataReader, cvDataReader, deviceId, makeMode); - - delete dataReader; - delete cvDataReader; -} - -template -void DoEdit(const ConfigParameters& config) -{ - wstring editPath = config("editPath"); - wstring ndlMacros = config("ndlMacros",""); - NDLScript ndlScript; - if (!ndlMacros.empty()) - ndlScript.LoadConfigFile(ndlMacros); - MELScript melScript; - melScript.LoadConfigFileAndResolveVariables(editPath, config); -} - -template -void DoConvertFromDbn(const ConfigParameters& config) -{ - //config.Insert("deviceId","-1"); //force using CPU - - wstring modelPath = 
config("modelPath"); - wstring dbnModelPath = config("dbnModelPath"); - - IComputationNetBuilder* netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(config); - ComputationNetwork& net = netBuilder->LoadNetworkFromFile(dbnModelPath); - net.SaveToFile(modelPath); - delete (netBuilder); -} -// process the command -template -void DoCommand(const ConfigParameters& config) -{ - ConfigArray command = config("command", "train"); - for (int i=0; i < command.size(); i++) - { - //get the configuration parameters that match the command - ConfigParameters commandParams (config(command[i])); - ConfigArray action = commandParams("action","train"); - - // determine the action to perform, and do it - for (int j=0; j < action.size(); j++) - { - if (action[j] == "train" || action[j] == "trainRNN") - DoTrain(commandParams); - else if (action[j] == "adapt") - DoAdapt(commandParams); - else if (action[j] == "test" || action[j] == "eval") - DoEval(commandParams); - else if (action[j] == "testunroll") - DoEvalUnroll(commandParams); - else if (action[j] == "edit") - DoEdit(commandParams); - else if (action[j] == "cv") - DoCrossValidate(commandParams); - else if (action[j] == "write") - DoWriteOutput(commandParams); - else if (action[j] == "devtest") - TestCn(config); // for "devtest" action pass the root config instead - else if (action[j] == "dumpnode") - DumpNodeInfo(commandParams); - else if (action[j] == "convertdbn") - DoConvertFromDbn(commandParams); - else if (action[j] == "createLabelMap") - DoCreateLabelMap(commandParams); - else - RuntimeError("unknown action: %s in command set: %s", action[j].c_str(), command[i].c_str()); - - NDLScript ndlScript; - ndlScript.ClearGlobal(); // clear global macros between commands - } - } -} - -std::string TimeDateStamp() -{ -#if 0 // "safe" version for Windows, not needed it seems - __time64_t localtime; - - _time64 (&localtime);// get current time and date - struct tm now; - _localtime64_s (&now, &localtime); // convert -#else - time_t t = time(NULL); - struct tm now = *localtime(&t); -#endif - char buf[30]; - sprintf (buf, "%04d/%02d/%02d %02d:%02d:%02d", now.tm_year + 1900, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec); - return buf; -} - -#ifdef MPI_SUPPORT -// Oh, my gosh, this is going to be ugly. MPI_INIT needs a char* argv[], so let's interface. 
-int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv) -{ - // this maps from the strings - std::map recover_wstring; - - // do the mapping to 8-bit encoding for MPI_Init() - vector> argv_string_vector; - transform(*argv, *argv + *argc, std::back_inserter(argv_string_vector), - [&recover_wstring](wchar_t*pws)->vector - { - std::string tmp = msra::strfun::utf8(std::wstring(pws)); - recover_wstring[tmp] = pws; - vector rv(tmp.begin(), tmp.end()); - rv.push_back('\0'); - return rv; - } - ); - vector argv_charptr_vector; - transform(argv_string_vector.begin(), argv_string_vector.end(), std::back_inserter(argv_charptr_vector), - [](std::vector&cs)->char*{ return &(cs[0]); } - ); - char** argv_char = &(argv_charptr_vector[0]); - - // Do the initialization - int rv = MPI_Init(argc, &argv_char); - - // try and reconstruct how MPI_Init changed the argv - transform(argv_char, argv_char + *argc, stdext::checked_array_iterator(*argv, *argc), - [&recover_wstring](char*pc)->wchar_t* - { - auto it = recover_wstring.find(std::string(pc)); - if (it == recover_wstring.end()) - RuntimeError("Unexpected interaction between MPI_Init and command line parameters"); - return it->second; - } - ); - - // pass through return value from internal call to MPI_Init() - return rv; -} -#endif - -int wmain(int argc, wchar_t* argv[]) -{ - try - { -#ifdef MPI_SUPPORT - { - int rc; - rc = MPI_Init(&argc, &argv); - if (rc != MPI_SUCCESS) - { - MPI_Abort(MPI_COMM_WORLD, rc); - RuntimeError("Failure in MPI_Init: %d", rc); - } - MPI_Comm_size(MPI_COMM_WORLD, &numProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &myRank); - fprintf(stderr, "MPI: RUNNING ON (%s), process %d/%d\n", getenv("COMPUTERNAME"), myRank, numProcs); - fflush(stderr); - } -#else - numProcs = 1; - myRank = 0; -#endif - - ConfigParameters config; - std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); - - // get the command param set they want - wstring logpath = config("stderr", L""); - ConfigArray command = config("command", "train"); - - if (logpath != L"") - { - for (int i=0; i < command.size(); i++) - { - logpath += L"_"; - logpath += (wstring)command[i]; - } - logpath += L".log"; - if (numProcs > 1) - { - std::wostringstream oss; - oss << myRank; - logpath += L"rank" + oss.str(); - } - RedirectStdErr(logpath); - } - - std::string timestamp = TimeDateStamp(); - - if (myRank == 0) // main process - { - //dump config info - fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); - fprintf(stderr, "command line options: \n"); - for (int i = 1; i < argc; i++) - fprintf(stderr, "%s ", WCharToString(argv[i]).c_str()); - - // This simply merges all the different config parameters specified (eg, via config files or via command line directly), - // and prints it. - fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n"); - fprintf(stderr, "%s\n", rawConfigString.c_str()); - fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n"); - - // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line), - // All of these assignments will appear, even though only the last assignment matters. 
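// A hypothetical illustration of the three dumps (the variable names are made
// up, not from the original source): given a config containing
//
//     rootDir=c:\data
//     featFile=$rootDir$\train.feats
//
// the raw dump prints featFile=$rootDir$\train.feats verbatim, the resolved
// dump prints featFile=c:\data\train.feats, and if rootDir is also overridden
// on the command line, both assignments show up in the resolved dump while the
// processed dump keeps only the final value.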
- fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); - fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str()); - fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); - - // This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last - // value it is set to will appear). - fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); - config.dumpWithResolvedVariables(); - fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); - - fprintf(stderr, "command: "); - for (int i = 0; i < command.size(); i++) - { - fprintf(stderr, "%s ", command[i].c_str()); - } - } - - //run commands - std::string type = config("precision", "float"); - // accept old precision key for backward compatibility - if (config.Exists("type")) - type = config("type", "float"); - if ( myRank == 0 ) - fprintf(stderr, "\nprecision = %s\n", type.c_str()); - if (type == "float") - DoCommand(config); - else if (type == "double") - DoCommand(config); - else - RuntimeError("invalid precision specified: %s", type.c_str()); - } - catch(const std::exception &err) - { - fprintf(stderr, "EXCEPTION occurred: %s", err.what()); -#ifdef _DEBUG - DebugBreak(); -#endif - return EXIT_FAILURE; - } - catch(...) - { - fprintf(stderr, "Unknown ERROR occurred"); -#ifdef _DEBUG - DebugBreak(); -#endif - return EXIT_FAILURE; - } -#ifdef MPI_SUPPORT - MPI_Finalize(); -#endif - return EXIT_SUCCESS; -} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// cn.cpp : Defines the entry point for the console application. 
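// The include list below adds the generated buildinfo.h consumed by
// PrintBuiltInfo(). A sketch of what prebuild.bat (added later in this change)
// is expected to emit; all concrete values here are placeholders:
//
//     #ifndef _BUILDINFO_H
//     #define _BUILDINFO_H
//     #define _GIT_EXIST
//     #define _BUILDBRANCH_ "master"
//     #define _BUILDSHA1_ "0123456789abcdef0123456789abcdef01234567"
//     #define _BUILDER_ "someuser"
//     #define _BUILDMACHINE_ "SOMEHOST"
//     #define _BUILDPATH_ "c:\\work\\cntk\\MachineLearning\\cn\\"
//     #endif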
+// + +#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ + +#include "stdafx.h" +#include "ComputationNetwork.h" +#include "ComputationNode.h" +#include "DataReader.h" +#include "DataWriter.h" +#include "SimpleNetworkBuilder.h" +#include "NDLNetworkBuilder.h" +#include "SynchronousExecutionEngine.h" +#include "ModelEditLanguage.h" +#include "SGD.h" +#include +#include "commandArgUtil.h" +#include "SimpleEvaluator.h" +#include "SimpleOutputWriter.h" +#include +#include +#if defined(_WIN32) +#include "io.h" +#endif +#include "hostname.h" +#include "buildinfo.h" +#ifdef LEAKDETECT +#include "vld.h" // for memory leak detection +#endif +#include +#include "BestGpu.h" + +// MPI builds on windows require the following installed to "c:\program files\Microsoft MPI\" +// HPC Pack 2012 R2 MS-MPI Redistributable Package +// http://www.microsoft.com/en-us/download/details.aspx?id=41634 + +#ifdef MPI_SUPPORT +#include "mpi.h" +#pragma comment(lib, "msmpi.lib") +#endif +int numProcs; +int myRank; + +using namespace std; +using namespace Microsoft::MSR::CNTK; + +// internal test routine forward declaration +template +void TestCn(const ConfigParameters& config); + +void RedirectStdErr(wstring logpath) +{ + fprintf (stderr, "Redirecting stderr to file %S\n", logpath.c_str()); + msra::files::make_intermediate_dirs (logpath); + auto_file_ptr f (logpath.c_str(), "wb"); + if (dup2 (fileno (f), 2) == -1) + RuntimeError ("unexpected failure to redirect stderr to log file"); + setvbuf (stderr, NULL, _IONBF, 16384); // unbuffer it +} + +std::string WCharToString(const wchar_t* wst) +{ + std::wstring ws(wst); + std::string s(ws.begin(), ws.end()); + s.assign(ws.begin(), ws.end()); + return s; +} + +template +void DumpNodeInfo(const ConfigParameters& config) +{ + wstring modelPath = config("modelPath"); + wstring nodeName = config("nodeName",L"__AllNodes__"); + wstring defOutFilePath = modelPath + L"." 
+ nodeName + L".txt"; + wstring outputFile = config("outputFile", WCharToString(defOutFilePath.c_str()).c_str()); + bool printValues = config("printValues", "true"); + + ComputationNetwork net(-1); //always use CPU + net.LoadFromFile(modelPath); + net.DumpNodeInfoToFile(nodeName, printValues, outputFile); +} + +template +void DoEvalBase(const ConfigParameters& config, IDataReader& reader) +{ + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "40960"); + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + + int traceLevel = config("traceLevel", "0"); + size_t numMBsToShowResult = config("numMBsToShowResult", "100"); + + ConfigArray evalNodeNames = config("evalNodeNames",""); + vector evalNodeNamesVector; + for (int i=0; i < evalNodeNames.size(); ++i) + { + evalNodeNamesVector.push_back(evalNodeNames[i]); + } + + ComputationNetwork net(deviceId); + net.LoadFromFile(modelPath); + net.ResetEvalTimeStamp(); + + SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); + eval.Evaluate(reader, evalNodeNamesVector, mbSize[0], epochSize); +} + +template +void DoEval(const ConfigParameters& config) +{ + //test + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DataReader testDataReader(readerConfig); + + DoEvalBase(config, testDataReader); +} + +template +void DoEvalUnroll(const ConfigParameters& config) +{ + //test + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DataReader testDataReader(readerConfig); + + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "40960"); + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + wstring path2EvalResults = config("path2EvalResults", L""); + + ComputationNetwork net(deviceId); + net.LoadFromFile(modelPath); + net.ResetEvalTimeStamp(); + + SimpleEvaluator eval(net); + ElemType evalEntropy; + eval.EvaluateUnroll(testDataReader, mbSize[0], evalEntropy, path2EvalResults == L""? 
nullptr : path2EvalResults.c_str(), epochSize); +} + +template +void DoCrossValidate(const ConfigParameters& config) +{ + //test + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "40960"); + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + + ConfigArray cvIntervalConfig = config("crossValidationInterval"); + intargvector cvInterval = cvIntervalConfig; + + size_t sleepSecondsBetweenRuns = config("sleepTimeBetweenRuns", "0"); + + int traceLevel = config("traceLevel", "0"); + size_t numMBsToShowResult = config("numMBsToShowResult", "100"); + + ConfigArray evalNodeNames = config("evalNodeNames",""); + vector evalNodeNamesVector; + for (int i=0; i < evalNodeNames.size(); ++i) + { + evalNodeNamesVector.push_back(evalNodeNames[i]); + } + + std::vector> cvErrorResults; + std::vector cvModels; + + DataReader cvDataReader(readerConfig); + + bool finalModelEvaluated = false; + for (size_t i=cvInterval[0]; i<=cvInterval[2]; i+=cvInterval[1]) + { + wstring cvModelPath = msra::strfun::wstrprintf (L"%ls.%lld", modelPath.c_str(), i); + + if (!fexists (cvModelPath)) + { + fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str()); + if (finalModelEvaluated || !fexists (modelPath)) + continue; // file missing + else + { + cvModelPath = modelPath; + finalModelEvaluated = true; + } + } + + cvModels.push_back(cvModelPath); + ComputationNetwork net(deviceId); + net.LoadFromFile(cvModelPath); + net.ResetEvalTimeStamp(); + + SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); + + fprintf(stderr, "model %ls --> \n",cvModelPath.c_str()); + std::vector evalErrors; + evalErrors = eval.Evaluate(cvDataReader, evalNodeNamesVector, mbSize[0], epochSize); + cvErrorResults.push_back(evalErrors); + + ::Sleep(1000*sleepSecondsBetweenRuns); + } + + //find best model + if (cvErrorResults.size() == 0) + throw std::logic_error("No model is evaluated."); + + std::vector minErrors; + std::vector minErrIds; + std::vector evalErrors = cvErrorResults[0]; + for (int i=0; i < evalErrors.size(); ++i) + { + minErrors.push_back(evalErrors[i]); + minErrIds.push_back(0); + } + + for (int i=0; i +void DoWriteOutput(const ConfigParameters& config) +{ + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + readerConfig.Insert("randomize","None"); //we don't want randomization when output results + + DataReader testDataReader(readerConfig); + + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "2048"); + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + + ConfigArray outputNodeNames = config("outputNodeNames",""); + vector outputNodeNamesVector; + for (int i=0; i < outputNodeNames.size(); ++i) + { + outputNodeNamesVector.push_back(outputNodeNames[i]); + } + + ComputationNetwork net(deviceId); + net.LoadFromFile(modelPath); + net.ResetEvalTimeStamp(); + + SimpleOutputWriter writer(net, 1); + + if (config.Exists("writer")) + { + ConfigParameters writerConfig (config("writer")); + bool bWriterUnittest = writerConfig("unittest","false"); + DataWriter 
testDataWriter(writerConfig); + writer.WriteOutput(testDataReader,mbSize[0], testDataWriter, outputNodeNamesVector, epochSize, bWriterUnittest); + } + else if (config.Exists("outputPath")) + { + wstring outputPath = config("outputPath"); // crashes if no default given? + writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, epochSize); + } + //writer.WriteOutput(testDataReader, mbSize[0], testDataWriter, outputNodeNamesVector, epochSize); +} + +namespace Microsoft { namespace MSR { namespace CNTK { + +TrainingCriterion ParseTrainingCriterionString(wstring s) +{ + msra::strfun::tolower_ascii(s); + if (s==L"crossentropywithsoftmax") + return TrainingCriterion::CrossEntropyWithSoftmax; + else if (s==L"squareerror") + return TrainingCriterion::SquareError; + else if (s!=L"classcrossentropywithsoftmax") // (twisted logic to keep compiler happy w.r.t. not returning from LogicError) + LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (CrossEntropyWithSoftmax | SquareError | ClassCrossEntropyWithSoftmax)"); + return TrainingCriterion::ClassCrossEntropyWithSoftmax; +} + +EvalCriterion ParseEvalCriterionString(wstring s) +{ + msra::strfun::tolower_ascii(s); + if (s==L"errorprediction") + return EvalCriterion::ErrorPrediction; + else if (s==L"crossentropywithsoftmax") + return EvalCriterion::CrossEntropyWithSoftmax; + else if (s==L"classcrossentropywithsoftmax") + return EvalCriterion::ClassCrossEntropyWithSoftmax; + else if (s!=L"squareerror") + LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (ErrorPrediction | CrossEntropyWithSoftmax | SquareError)"); + return EvalCriterion::SquareError; +} + +}}}; + +template +void DoCreateLabelMap(const ConfigParameters& config) +{ + // this gets the section name we are interested in + std::string section = config("section"); + // get that section (probably a peer config section, which works thanks to heirarchal symbol resolution) + ConfigParameters configSection (config(section)); + ConfigParameters readerConfig (configSection("reader")); + readerConfig.Insert("allowMapCreation","true"); + DEVICEID_TYPE deviceId = CPUDEVICE; + size_t minibatchSize = config("minibatchSize", "2048"); + int traceLevel = config("traceLevel","0"); + std::vector featureNames; + std::vector labelNames; + GetFileConfigNames(readerConfig, featureNames, labelNames); + + // setup minibatch matrices + Matrix featuresMatrix(deviceId); + Matrix labelsMatrix(deviceId); + std::map*> matrices; + matrices[featureNames[0]] = &featuresMatrix; + if (labelNames.size() == 0) + RuntimeError("CreateLabelMap: no labels found to process"); + + // now create the reader and loop through the entire dataset to get all the labels + auto start = std::chrono::system_clock::now(); + for (const std::wstring& labelsName: labelNames) + { + // take the last label file defined (the other one might be input) + matrices[labelsName] = &labelsMatrix; + + // get the label mapping file name + ConfigParameters labelConfig (readerConfig(labelsName)); + std::string labelMappingFile; + if (labelConfig.ExistsCurrent("labelMappingFile")) + labelMappingFile = labelConfig("labelMappingFile"); + else if (readerConfig.ExistsCurrent("labelMappingFile")) + labelMappingFile = labelConfig("labelMappingFile"); + else + RuntimeError("CreateLabelMap: No labelMappingFile defined"); + + if (fexists(labelMappingFile)) + { + fprintf(stderr,"CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str()); + return; + } 
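// When the mapping file does not exist yet, the code below appears to work as
// follows (an inference from the calls, not a statement from the original
// author): the reader was created with allowMapCreation=true, one full pass
// over the data lets it observe every label value, and the second
// StartMinibatchLoop(..., 1, ...) call advances the reader past epoch 0 so the
// completed mapping can be persisted to labelMappingFile.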
+ fprintf(stderr,"CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str()); + + DataReader dataReader(readerConfig); + + dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize); + int count = 0; + while (dataReader.GetMinibatch(matrices)) + { + Matrix& features = *matrices[featureNames[0]]; + count += features.GetNumCols(); + if (traceLevel > 1) + fprintf(stderr,"."); // progress meter + } + dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize); + + // print the results + if (traceLevel > 0) + fprintf(stderr,"\nread %d labels and produced %s\n", count, labelMappingFile.c_str()); + } + auto end = std::chrono::system_clock::now(); + auto elapsed = end-start; + if (traceLevel > 1) + fprintf(stderr, "%f seconds elapsed\n", (float)(std::chrono::duration_cast(elapsed).count())/1000); +} + + +template +void DoTrain(const ConfigParameters& config) +{ + ConfigParameters configSGD (config("SGD")); + bool makeMode = config("makeMode", "true"); + + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + IComputationNetBuilder* netBuilder = NULL; + + if (config.Exists("NDLNetworkBuilder")) + { + ConfigParameters configNDL (config("NDLNetworkBuilder")); + netBuilder = (IComputationNetBuilder*)new NDLBuilder(configNDL); + } + else if (config.Exists("SimpleNetworkBuilder")) + { + ConfigParameters configSNB (config("SimpleNetworkBuilder")); + netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(configSNB); + } + else + { + RuntimeError("No network builder found in the config file. NDLNetworkBuilder or SimpleNetworkBuilde must be specified" ); + } + + DataReader* dataReader = new DataReader(readerConfig); + + DataReader* cvDataReader = nullptr; + ConfigParameters cvReaderConfig (config("cvReader", L"")); + + if (cvReaderConfig.size() != 0) + { + cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); + cvDataReader = new DataReader(cvReaderConfig); + } + + SGD sgd(configSGD); + + sgd.Train(netBuilder, dataReader, cvDataReader, makeMode); + + delete netBuilder; + delete dataReader; + delete cvDataReader; +} + +template +void DoAdapt(const ConfigParameters& config) +{ + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + + ConfigParameters configSGD (config("SGD")); + bool makeMode = config("makeMode", "true"); + + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DataReader* dataReader = new DataReader(readerConfig); + + DataReader* cvDataReader = nullptr; + ConfigParameters cvReaderConfig (config("cvReader", L"")); + + if (cvReaderConfig.size() != 0) + { + cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); + cvDataReader = new DataReader(cvReaderConfig); + } + + wstring origModelFileName = config("origModelFileName", L""); + wstring refNodeName = config("refNodeName", L""); + + SGD sgd(configSGD); + + sgd.Adapt(origModelFileName, refNodeName, dataReader, cvDataReader, deviceId, makeMode); + + delete dataReader; + delete cvDataReader; +} + +template +void DoEdit(const ConfigParameters& config) +{ + wstring editPath = config("editPath"); + wstring ndlMacros = config("ndlMacros",""); + NDLScript ndlScript; + if (!ndlMacros.empty()) + ndlScript.LoadConfigFile(ndlMacros); + MELScript melScript; + melScript.LoadConfigFileAndResolveVariables(editPath, config); +} + +template +void DoConvertFromDbn(const ConfigParameters& config) +{ + //config.Insert("deviceId","-1"); //force using CPU + + wstring modelPath = 
config("modelPath"); + wstring dbnModelPath = config("dbnModelPath"); + + IComputationNetBuilder* netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(config); + ComputationNetwork& net = netBuilder->LoadNetworkFromFile(dbnModelPath); + net.SaveToFile(modelPath); + delete (netBuilder); +} +// process the command +template +void DoCommand(const ConfigParameters& config) +{ + ConfigArray command = config("command", "train"); + for (int i=0; i < command.size(); i++) + { + //get the configuration parameters that match the command + ConfigParameters commandParams (config(command[i])); + ConfigArray action = commandParams("action","train"); + + // determine the action to perform, and do it + for (int j=0; j < action.size(); j++) + { + if (action[j] == "train" || action[j] == "trainRNN") + DoTrain(commandParams); + else if (action[j] == "adapt") + DoAdapt(commandParams); + else if (action[j] == "test" || action[j] == "eval") + DoEval(commandParams); + else if (action[j] == "testunroll") + DoEvalUnroll(commandParams); + else if (action[j] == "edit") + DoEdit(commandParams); + else if (action[j] == "cv") + DoCrossValidate(commandParams); + else if (action[j] == "write") + DoWriteOutput(commandParams); + else if (action[j] == "devtest") + TestCn(config); // for "devtest" action pass the root config instead + else if (action[j] == "dumpnode") + DumpNodeInfo(commandParams); + else if (action[j] == "convertdbn") + DoConvertFromDbn(commandParams); + else if (action[j] == "createLabelMap") + DoCreateLabelMap(commandParams); + else + RuntimeError("unknown action: %s in command set: %s", action[j].c_str(), command[i].c_str()); + + NDLScript ndlScript; + ndlScript.ClearGlobal(); // clear global macros between commands + } + } +} + +std::string TimeDateStamp() +{ +#if 0 // "safe" version for Windows, not needed it seems + __time64_t localtime; + + _time64 (&localtime);// get current time and date + struct tm now; + _localtime64_s (&now, &localtime); // convert +#else + time_t t = time(NULL); + struct tm now = *localtime(&t); +#endif + char buf[30]; + sprintf (buf, "%04d/%02d/%02d %02d:%02d:%02d", now.tm_year + 1900, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec); + return buf; +} + +#ifdef MPI_SUPPORT +// Oh, my gosh, this is going to be ugly. MPI_INIT needs a char* argv[], so let's interface. 
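// What the wrapper below does, in brief: it converts the wide argv to UTF-8
// copies for the real (char-based) MPI_Init, remembers each UTF-8 string's
// original wide pointer in recover_wstring, and after MPI_Init returns,
// possibly having removed or reordered MPI's own arguments, rebuilds the wide
// argv from the surviving strings, failing loudly if MPI hands back a string
// that was never produced from the original command line.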
+int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv) +{ + // this maps from the strings + std::map recover_wstring; + + // do the mapping to 8-bit encoding for MPI_Init() + vector> argv_string_vector; + transform(*argv, *argv + *argc, std::back_inserter(argv_string_vector), + [&recover_wstring](wchar_t*pws)->vector + { + std::string tmp = msra::strfun::utf8(std::wstring(pws)); + recover_wstring[tmp] = pws; + vector rv(tmp.begin(), tmp.end()); + rv.push_back('\0'); + return rv; + } + ); + vector argv_charptr_vector; + transform(argv_string_vector.begin(), argv_string_vector.end(), std::back_inserter(argv_charptr_vector), + [](std::vector&cs)->char*{ return &(cs[0]); } + ); + char** argv_char = &(argv_charptr_vector[0]); + + // Do the initialization + int rv = MPI_Init(argc, &argv_char); + + // try and reconstruct how MPI_Init changed the argv + transform(argv_char, argv_char + *argc, stdext::checked_array_iterator(*argv, *argc), + [&recover_wstring](char*pc)->wchar_t* + { + auto it = recover_wstring.find(std::string(pc)); + if (it == recover_wstring.end()) + RuntimeError("Unexpected interaction between MPI_Init and command line parameters"); + return it->second; + } + ); + + // pass through return value from internal call to MPI_Init() + return rv; +} +#endif + +void PrintBuiltInfo() +{ + fprintf(stderr, "-------------------------------------------------------------------\n"); + fprintf(stderr, "Build info: \n\n"); + fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__); + fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__); + fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_); + fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_); +#ifdef _GIT_EXIST + fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); + fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_); +#endif + fprintf(stderr, "-------------------------------------------------------------------\n"); + +} + + +int wmain(int argc, wchar_t* argv[]) +{ + + try + { +#ifdef MPI_SUPPORT + { + int rc; + rc = MPI_Init(&argc, &argv); + if (rc != MPI_SUCCESS) + { + MPI_Abort(MPI_COMM_WORLD, rc); + RuntimeError("Failure in MPI_Init: %d", rc); + } + MPI_Comm_size(MPI_COMM_WORLD, &numProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + fprintf(stderr, "MPI: RUNNING ON (%s), process %d/%d\n", getenv("COMPUTERNAME"), myRank, numProcs); + fflush(stderr); + } +#else + numProcs = 1; + myRank = 0; +#endif + + ConfigParameters config; + std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); + + // get the command param set they want + wstring logpath = config("stderr", L""); + // [1/26/2015 erw, add done file so that it can be used on HPC] + wstring DoneFile = config("DoneFile", L""); + ConfigArray command = config("command", "train"); + + if (logpath != L"") + { + for (int i=0; i < command.size(); i++) + { + logpath += L"_"; + logpath += (wstring)command[i]; + } + logpath += L".log"; + if (numProcs > 1) + { + std::wostringstream oss; + oss << myRank; + logpath += L"rank" + oss.str(); + } + + RedirectStdErr(logpath); + } + + + PrintBuiltInfo(); + + + std::string timestamp = TimeDateStamp(); + + if (myRank == 0) // main process + { + //dump config info + fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); + fprintf(stderr, "command line options: \n"); + for (int i = 1; i < argc; i++) + fprintf(stderr, "%s ", WCharToString(argv[i]).c_str()); + + // This simply merges all the different config parameters specified (eg, via 
config files or via command line directly), + // and prints it. + fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n"); + fprintf(stderr, "%s\n", rawConfigString.c_str()); + fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n"); + + // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line), + // All of these assignments will appear, even though only the last assignment matters. + fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); + fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str()); + fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); + + // This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last + // value it is set to will appear). + fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); + config.dumpWithResolvedVariables(); + fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); + + fprintf(stderr, "command: "); + for (int i = 0; i < command.size(); i++) + { + fprintf(stderr, "%s ", command[i].c_str()); + } + } + + //run commands + std::string type = config("precision", "float"); + // accept old precision key for backward compatibility + if (config.Exists("type")) + type = config("type", "float"); + if ( myRank == 0 ) + fprintf(stderr, "\nprecision = %s\n", type.c_str()); + if (type == "float") + DoCommand(config); + else if (type == "double") + DoCommand(config); + else + RuntimeError("invalid precision specified: %s", type.c_str()); + + // still here , write a DoneFile if necessary + if (!DoneFile.empty()){ + FILE* fp = fopenOrDie(DoneFile.c_str(), L"w"); + fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(),GetHostName().c_str()); + fcloseOrDie(fp); + } + } + catch (const std::exception &err) + { + fprintf(stderr, "EXCEPTION occurred: %s", err.what()); +#ifdef _DEBUG + DebugBreak(); +#endif + return EXIT_FAILURE; + } + catch(...) + { + fprintf(stderr, "Unknown ERROR occurred"); +#ifdef _DEBUG + DebugBreak(); +#endif + return EXIT_FAILURE; + } +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + return EXIT_SUCCESS; +} diff --git a/MachineLearning/cn/cn.vcxproj b/MachineLearning/cn/cn.vcxproj index 4f301d7aa..090f87a0b 100644 --- a/MachineLearning/cn/cn.vcxproj +++ b/MachineLearning/cn/cn.vcxproj @@ -139,6 +139,9 @@ true Copy content files to target directory + + prebuild.bat + @@ -199,6 +202,9 @@ + + prebuild.bat + @@ -216,6 +222,7 @@ + @@ -249,6 +256,7 @@ NotUsing + diff --git a/MachineLearning/cn/cn.vcxproj.filters b/MachineLearning/cn/cn.vcxproj.filters index 65605bc86..0d071e72a 100644 --- a/MachineLearning/cn/cn.vcxproj.filters +++ b/MachineLearning/cn/cn.vcxproj.filters @@ -43,6 +43,9 @@ Network + + Common + @@ -138,6 +141,9 @@ Common\Include + + Common\Include + diff --git a/MachineLearning/cn/prebuild.bat b/MachineLearning/cn/prebuild.bat new file mode 100644 index 000000000..8759732a9 --- /dev/null +++ b/MachineLearning/cn/prebuild.bat @@ -0,0 +1,30 @@ +@echo off + + +echo #ifndef _BUILDINFO_H > buildinfo.h +echo #define _BUILDINFO_H >> buildinfo.h + + +FOR /F "usebackq" %%i IN (`hostname`) DO SET HOST=%%i +:: assuming hostname always exists + +:: not sure whether git in path ? 
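:: (the cn.vcxproj change above appears to run this script as a pre-build
:: step, so buildinfo.h is regenerated before every build of cn.exe)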
+:: probe by running git: if git is not on the PATH, cmd.exe sets ERRORLEVEL
+:: to 9009 ("not recognized as an internal or external command"), so any
+:: other ERRORLEVEL means git ran and can be queried for branch and commit
+git --version >nul 2>&1
+if not %ERRORLEVEL% == 9009 (
+    echo #define _GIT_EXIST >> buildinfo.h
+    FOR /F "usebackq" %%i IN (`git rev-parse --abbrev-ref HEAD`) DO SET BRANCH=%%i
+    FOR /F "usebackq" %%i IN (`git rev-parse HEAD`) DO SET COMMIT=%%i
+    echo #define _BUILDBRANCH_ "%BRANCH%" >> buildinfo.h
+    echo #define _BUILDSHA1_ "%COMMIT%" >> buildinfo.h
+)
+
+echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h
+echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h
+
+set a=%~dp0
+set buildpath="%a:\=\\%"
+echo #define _BUILDPATH_ %buildpath% >> buildinfo.h
+
+echo #endif >> buildinfo.h
diff --git a/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp b/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp
index 65837721b..ecd4e2f48 100644
--- a/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp
+++ b/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp
@@ -1,217 +1,264 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-#include "stdafx.h"
-#include "CppUnitTest.h"
-#include "..\Math\Matrix.h"
-
-#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project
-
-#define epsilon 0.000001
-#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
-
-using namespace Microsoft::MSR::CNTK;
-using namespace Microsoft::VisualStudio::CppUnitTestFramework;
-
-
-namespace CNTKMathTest
-{
-    TEST_CLASS(MatrixUnitTest)
-    {
-
-    public:
-
-        //This test should fail if you don't have CUDA GPU (or working under remote desktop)
-        TEST_METHOD(MatrixChangeModesBetweenDenseAndSparseTests_Simple)
-        {
-            Matrix A;
-            A.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0);
-            long n0 = A.MatrixNorm0();
-            Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
-            A.SwitchToMatrixType(MatrixType::SPARSE);
-            Assert::IsTrue(MatrixType::SPARSE==A.GetMatrixType());
-            long n1 = A.MatrixNorm0();
-            Assert::AreEqual(n0,n1);
-            A.SwitchToMatrixType(MatrixType::DENSE);
-            Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
-        }
-
-        TEST_METHOD(MatrixSparseTimesDense)
-        {
-            Matrix Ad; //DENSE
-            Ad.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0);//DENSE
-            Matrix As(Ad);//DENSE
-            As.SwitchToMatrixType(MatrixType::SPARSE); //!!!
MATRIX As becomes sparse - Matrix B = Matrix::RandomGaussian(2048,128,1,4); //DENSE - Matrix C = Matrix::RandomGaussian(4096,128,1,2); //DENSE - Matrix C1(C); //DENSE - - float alpha = 0.3, beta = 2; - bool transposeA=false, transposeB=false; - Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,B,transposeB,beta,C); // DENSE*DENSE - Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,B,transposeB,beta,C1);// SPARSE*DENSE - Assert::IsTrue(C1.IsEqualTo(C,0.00001)); - } - - TEST_METHOD(MatrixDenseTimesSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); - - Matrix B = Matrix::RandomGaussian(2048,1024,1,4); - Matrix C = Matrix::RandomGaussian(2048,2048,1,2); - Matrix C1(C); - - float alpha = 0.3, beta = 0; - bool transposeA=false, transposeB=false; - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); - Assert::IsTrue(C1.IsEqualTo(C,0.0001)); - - alpha = 3.3, beta = 1.3; - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); - Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision - } - - TEST_METHOD(MatrixSparseTimesSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(2048,1024,-5,0.4,0),0); - Matrix Bs(Bd); - - Matrix Cd; - Cd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,1024,-4,0.2,0),0); - Matrix Cs(Cd); - - float alpha = 2.4, beta=0; - bool transposeA = false, transposeB=false; - Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - Cs.SwitchToMatrixType(MatrixType::SPARSE); - - Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); - Cs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); - - - alpha = 2.4, beta=3.4; - Cs.SwitchToMatrixType(MatrixType::SPARSE); - Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - Cs.SwitchToMatrixType(MatrixType::SPARSE); - - Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); - Cs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); - } - - TEST_METHOD(MatrixSparsePlusSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bs(Bd); - - float alpha = 1.0*rand() / RAND_MAX; - Matrix::ScaleAndAdd(alpha,Ad,Bd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - Matrix::ScaleAndAdd(alpha,As,Bs); - - Bs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); - } - - TEST_METHOD(MatrixDensePlusSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bs(Bd); - - float alpha = 1.0*rand() / RAND_MAX; - Matrix::ScaleAndAdd(alpha,Ad,Bd); - - Bs.SwitchToMatrixType(MatrixType::SPARSE); - 
Matrix::ScaleAndAdd(alpha,Ad,Bs); - - Bs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); - } - - TEST_METHOD(MatrixSparsePlusDense) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bd1(Bd); - - float alpha = 1.0*rand() / RAND_MAX; - Matrix::ScaleAndAdd(alpha,Ad,Bd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Matrix::ScaleAndAdd(alpha,As,Bd1); - - Assert::IsTrue(Bd1.IsEqualTo(Bd,0.00001)); - } - - TEST_METHOD(MatrixSparseElementWisePower) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - As.SwitchToMatrixType(MatrixType::SPARSE); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bs(Bd); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - - Ad^=2.3; //DENSE - As^=2.3; //SPARSE - Assert::IsTrue(As.IsEqualTo(Ad,0.00001)); - Assert::IsTrue(Ad.IsEqualTo(As,0.00001)); - - Bd.AssignElementPowerOf(Ad,3.2); - Bs.AssignElementPowerOf(As,3.2); -#ifdef CHECK - Bs.SwitchToMatrixType(DENSE); - Bd.TransferFromDeviceToDevice(0,CPUDEVICE); - Bs.TransferFromDeviceToDevice(0,CPUDEVICE); - for (int r = 0; r < Bd.GetNumRows(); ++r) - for (int c = 0; c < Bd.GetNumCols(); ++c) - { - float dVal = Bd(r,c); - float sVal = Bs(r,c); - float diff = sVal - dVal; - if (fabs(diff) > 0.001) - cout << "[" << r << ", " << c << "]: " << sVal << " and " << dVal; - } -#endif - Assert::IsTrue(Bs.IsEqualTo(Bd,0.0001)); - Assert::IsTrue(Bd.IsEqualTo(Bs,0.0001)); - } - }; +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#include "stdafx.h" +#include "CppUnitTest.h" +#include "..\Math\Matrix.h" + +#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project + +#define epsilon 0.000001 +#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing + +using namespace Microsoft::MSR::CNTK; +using namespace Microsoft::VisualStudio::CppUnitTestFramework; + + +namespace CNTKMathTest +{ + TEST_CLASS(MatrixUnitTest) + { + + public: + + //This test should fail if you don't have CUDA GPU (or working under remote desktop) + TEST_METHOD(MatrixChangeModesBetweenDenseAndSparseTests_Simple) + { + Matrix A; + A.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0); + long n0 = A.MatrixNorm0(); + Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType()); + A.SwitchToMatrixType(MatrixType::SPARSE); + Assert::IsTrue(MatrixType::SPARSE==A.GetMatrixType()); + long n1 = A.MatrixNorm0(); + Assert::AreEqual(n0,n1); + A.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType()); + } + + TEST_METHOD(MatrixSparseTimesDense) + { + Matrix Ad; //DENSE + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0);//DENSE + Matrix As(Ad);//DENSE + As.SwitchToMatrixType(MatrixType::SPARSE); //!!! 
MATRIX As becomes sparse + Matrix B = Matrix::RandomGaussian(2048,128,1,4); //DENSE + Matrix C = Matrix::RandomGaussian(4096,128,1,2); //DENSE + Matrix C1(C); //DENSE + + float alpha = 0.3, beta = 2; + bool transposeA=false, transposeB=false; + Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,B,transposeB,beta,C); // DENSE*DENSE + Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,B,transposeB,beta,C1);// SPARSE*DENSE + Assert::IsTrue(C1.IsEqualTo(C,0.00001)); + } + + TEST_METHOD(MatrixDenseTimesSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); + + Matrix B = Matrix::RandomGaussian(2048,1024,1,4); + Matrix C = Matrix::RandomGaussian(2048,2048,1,2); + Matrix C1(C); + + float alpha = 0.3, beta = 0; + bool transposeA=false, transposeB=false; + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); + Assert::IsTrue(C1.IsEqualTo(C,0.0001)); + + alpha = 3.3, beta = 1.3; + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); + Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision + } + + TEST_METHOD(CPUMatrixDenseTimesSparse) + { + Matrix Ad(CPUDEVICE); + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024, 2048, -3, 0.1, 0), 0); + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); + + Matrix B = Matrix::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix C = Matrix::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix C1(C); + + float alpha = 0.3, beta = 0; + bool transposeA = false, transposeB = false; + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C); + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1); + Assert::IsTrue(C1.IsEqualTo(C, 0.0001)); + + alpha = 3.3, beta = 1.3; + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C); + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1); + + // TODO IsEqualTo NYI + // Assert::IsTrue(C1.IsEqualTo(C, 0.00005)); + } + + TEST_METHOD(CPUMatrixDenseTimesSparseAsSparse) + { + Matrix Ad(CPUDEVICE); + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(2048, 1024, -3, 0.1, 0), 0); + + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); + + Matrix B = Matrix::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix AsCsc = Matrix::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix AsBlock(CPUDEVICE); + AsBlock.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol); + + float alpha = 0.3, beta = 0; + bool transposeA = false, transposeB = true; + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, AsBlock); + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, AsCsc); + + // TODO IsEqualTo NYI + // Assert::IsTrue(AsBlock.IsEqualTo(AsCsc, 0.0001)); + } + + TEST_METHOD(MatrixSparseTimesSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(2048,1024,-5,0.4,0),0); + Matrix Bs(Bd); + + Matrix Cd; + Cd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,1024,-4,0.2,0),0); + Matrix Cs(Cd); + + float 
alpha = 2.4, beta=0; + bool transposeA = false, transposeB=false; + Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Cs.SwitchToMatrixType(MatrixType::SPARSE); + + Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); + Cs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); + + + alpha = 2.4, beta=3.4; + Cs.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Cs.SwitchToMatrixType(MatrixType::SPARSE); + + Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); + Cs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); + } + + TEST_METHOD(MatrixSparsePlusSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bs(Bd); + + float alpha = 1.0*rand() / RAND_MAX; + Matrix::ScaleAndAdd(alpha,Ad,Bd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::ScaleAndAdd(alpha,As,Bs); + + Bs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); + } + + TEST_METHOD(MatrixDensePlusSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bs(Bd); + + float alpha = 1.0*rand() / RAND_MAX; + Matrix::ScaleAndAdd(alpha,Ad,Bd); + + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::ScaleAndAdd(alpha,Ad,Bs); + + Bs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); + } + + TEST_METHOD(MatrixSparsePlusDense) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bd1(Bd); + + float alpha = 1.0*rand() / RAND_MAX; + Matrix::ScaleAndAdd(alpha,Ad,Bd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::ScaleAndAdd(alpha,As,Bd1); + + Assert::IsTrue(Bd1.IsEqualTo(Bd,0.00001)); + } + + TEST_METHOD(MatrixSparseElementWisePower) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bs(Bd); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + + Ad^=2.3; //DENSE + As^=2.3; //SPARSE + Assert::IsTrue(As.IsEqualTo(Ad,0.00001)); + Assert::IsTrue(Ad.IsEqualTo(As,0.00001)); + + Bd.AssignElementPowerOf(Ad,3.2); + Bs.AssignElementPowerOf(As,3.2); +#ifdef CHECK + Bs.SwitchToMatrixType(DENSE); + Bd.TransferFromDeviceToDevice(0,CPUDEVICE); + Bs.TransferFromDeviceToDevice(0,CPUDEVICE); + for (int r = 0; r < Bd.GetNumRows(); ++r) + for (int c = 0; c < Bd.GetNumCols(); ++c) + { + float dVal = Bd(r,c); + float sVal = Bs(r,c); + float diff = sVal - dVal; + if (fabs(diff) > 0.001) + cout << "[" << r << ", " << c << "]: " << sVal << " and " << dVal; + } +#endif + Assert::IsTrue(Bs.IsEqualTo(Bd,0.0001)); + Assert::IsTrue(Bd.IsEqualTo(Bs,0.0001)); + } + }; } \ No newline at end of file diff --git a/Math/Math/CPUSparseMatrix.cpp 
b/Math/Math/CPUSparseMatrix.cpp index 0e2fbeca9..266e3981f 100644 --- a/Math/Math/CPUSparseMatrix.cpp +++ b/Math/Math/CPUSparseMatrix.cpp @@ -1,944 +1,962 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// Math.cpp : Defines the exported functions for the DLL application. -// - -#include "stdafx.h" -#include -#include -#include -#include -#include "CPUMatrix.h" -#include "CPUSparseMatrix.h" -#include -#include -#ifdef _WIN32 -#include -#endif -#ifdef LEAKDETECT -#include -#endif - -#include "basetypes.h" -#include "fileutil.h" - - -#ifndef USE_MKL -// use ACML as default. -// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above -// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/ -// Install the ifort64 variant (compiled with intel compiler) of the library -// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml -// to point to your folder for the include file and link library -#include // requires ACML 5.3.0 and above -#else -// requires MKL 10.0 and above -#endif - -// This is an example of an exported variable -//MATH_API int nMath=0; - -// This is an example of an exported function. -//MATH_API int fnMath(void) -//{ -// return 42; -//} - -#ifndef USE_MKL //MKL has one additional parameter for different matrix order -#define BLAS_COLMAJOR -#else -#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor, -#endif - -#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);} -#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing -namespace Microsoft { namespace MSR { namespace CNTK { - -#pragma region Helpful Enum Definitions - enum class MatrixOrder - { - RowMajor = 101, // row-major arrays - ColMajor = 102 // column-major arrays - }; - - enum class MatrixTranspose : char - { - NoTrans = 'N', // trans='N' - Trans = 'T', // trans='T' - ConjTrans = 'C' // trans='C' - }; - - enum class SymMatrixType : char - { - Up = 'U', // symmetric matrix is stored in the upper part - Low = 'L', // symmetric matrix is stored in thelower part - Full = 'F', //full populated - NotSymmetric = 'N' //not a symmetric matrix - }; - - enum class MatrixOpSide : char - { - Left = 'L', // left multiply - Right = 'R', // right multiply - }; -#pragma endregion Helpful Enum Definitions - -#pragma region Constructors and Destructor - - //should only be used by constructors. - template - void CPUSparseMatrix::ZeroInit() - { - m_numRows = 0; - m_numCols = 0; - m_elemSizeAllocated = 0; - m_compIndexSize = 0; - m_externalBuffer = false; - m_computeDevice = CPUDEVICE; - m_nz = 0; - m_matrixName = NULL; - - //if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - m_colIdx = -1; - m_pArray = NULL; - m_unCompIndex = NULL; - m_compIndex = NULL; - } - //else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - m_blockSize = 0; - m_blockVal = NULL; - m_blockIds = NULL; - } - } - - //should only be used by constructors. 
- template - void CPUSparseMatrix::CheckInit(const MatrixFormat format) - { - if (format != MatrixFormat::matrixFormatSparseCSC && format != MatrixFormat::matrixFormatSparseCSR && format != MatrixFormat::matrixFormatSparseBlockCol && format != MatrixFormat::matrixFormatSparseBlockRow) - { - throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format"); - } - m_format = format; - ZeroInit(); - } - - template - CPUSparseMatrix::CPUSparseMatrix(const MatrixFormat format) - { - CheckInit(format); - } - - template - CPUSparseMatrix::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size) - { - CheckInit(format); - Resize(numRows, numCols, size); - } - - template - CPUSparseMatrix::~CPUSparseMatrix() - { - if (m_matrixName!=NULL) - { - delete[] m_matrixName; - m_matrixName = nullptr; - } - if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - if(m_pArray != NULL) - delete[] m_pArray; - if(m_unCompIndex != NULL) - delete[] m_unCompIndex; - if(m_compIndex != NULL) - delete[] m_compIndex; - } - else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - if(m_blockVal != NULL) - delete[] m_blockVal; - if(m_blockIds != NULL) - delete[] m_blockIds; - } - } - - - -#pragma endregion Constructors and Destructor - -#pragma region Basic Operators - - //make sure call order in colume wise for CSC and row wise for CSR - template - void CPUSparseMatrix::SetValue(const size_t row, const size_t col, const ElemType v) - { - if(m_format != MatrixFormat::matrixFormatSparseCSC && m_format != MatrixFormat::matrixFormatSparseCSR) - { - throw std::logic_error("CPUSparseMatrix: unsupported SetValue() call."); - } - - if(m_elemSizeAllocated < m_nz +1) //automatic resize - { - Resize(m_numRows, m_numCols, m_nz + 100); //allocate 100 more elelemnts and keep existing values - } - - if(row < 0 || row >= m_numRows) - { - throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id"); - } - - if(col < 0 || col >= m_numCols) { - throw std::logic_error("CPUSparseMatrix: SetValue() invalid column id"); - } - - size_t r = (m_format == matrixFormatSparseCSC) ? row: col; - size_t c = (m_format == matrixFormatSparseCSC) ? col: row; - - m_pArray[m_nz] = v; - m_unCompIndex[m_nz] = (CPUSPARSE_INDEX_TYPE)r; - - //consistency check - if(c == m_colIdx && r <= m_unCompIndex[m_nz-1]) - { - throw std::logic_error("CPUSparseMatrix: SetValue is not called properly"); - } - - if (c != m_colIdx) - { - m_compIndex[c] = CPUSPARSE_INDEX_TYPE(m_nz); - m_colIdx = (int) c; - } - m_compIndex[c + 1] = CPUSPARSE_INDEX_TYPE(m_nz + 1); - m_nz++; - } - - template - ElemType* CPUSparseMatrix::BufferPointer() const - { - if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - return m_pArray; - } - else - { - return m_blockVal; - } - } - - template - void CPUSparseMatrix::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues) - { - size_t newCompIndexSize = (numCols > numRows ? 
numCols : numRows) + 1; - bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize); - - m_numRows = numRows; - m_numCols = numCols; - - if (reallocate) - { - if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - ElemType *pArray = new ElemType[numNZElemToReserve]; - CPUSPARSE_INDEX_TYPE *unCompIndex = new CPUSPARSE_INDEX_TYPE[numNZElemToReserve]; - CPUSPARSE_INDEX_TYPE *compIndex = new CPUSPARSE_INDEX_TYPE[newCompIndexSize]; - - if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) - throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); - - if (keepExistingValues && m_nz > 0) - { - assert(m_compIndexSize > 0 && m_nz < numNZElemToReserve); - memcpy(pArray, m_pArray, NzSize()); - memcpy(unCompIndex, m_unCompIndex, MajorIndexSize()); - memcpy(compIndex, m_compIndex, SecondaryIndexSize()); - } - - if (m_pArray != NULL) - delete[] m_pArray; - if (m_unCompIndex != NULL) - delete[] m_unCompIndex; - if (m_compIndex != NULL) - delete[] m_compIndex; - - m_pArray = pArray; - m_unCompIndex = unCompIndex; - m_compIndex = compIndex; - } - else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - ElemType *blockVal = new ElemType[numNZElemToReserve]; - size_t *blockIds = new size_t[newCompIndexSize]; - - if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) - throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); - - if (keepExistingValues && m_elemSizeAllocated > 0) - { - assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve); - memcpy(blockVal, m_blockVal, NzSize()); - memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize); - } - - if (m_blockVal != NULL) - delete[] m_blockVal; - if(m_blockIds != NULL) - delete[] m_blockIds; - - m_blockVal = blockVal; - m_blockIds = blockIds; - } - - m_elemSizeAllocated = numNZElemToReserve; - m_compIndexSize = newCompIndexSize; - } - } - - //Reset matrix so it can be reused - template - void CPUSparseMatrix::Reset() - { - m_nz = 0; - m_colIdx = -1; - m_blockSize = 0; - } - - //c = alpha*op(lhs) * op(rhs) + beta*c - template - void CPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, - const CPUSparseMatrix& rhs, const bool transposeB, ElemType beta, CPUMatrix& c) - - { - if (lhs.IsEmpty() || rhs.IsEmpty()) - throw std::logic_error("MultiplyAndWeightedAdd: one of the input matrix is empty."); - - int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); - int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); - int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); - int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); - - assert (m>0 && k>0 && l>0 && n>0); //converting from size_t to int may cause overflow - assert (k == l); - if (k != l) - { - throw std::invalid_argument("CPUSparseMatrix::MultiplyAndWeightedAdd: The inner dimensions of a and b must match."); - } - - if (c.GetNumRows() != m || c.GetNumCols() != n) - { - c.Resize(m,n); - } - - if (beta == 0) - { - memset(c.GetArray(), 0, sizeof(ElemType) * c.GetNumElements()); - } - else if (beta != 1) - { -#pragma omp parallel for - foreach_coord(i,j,c) - { - c(i,j) = beta * c(i,j); - } - } - - if (rhs.GetFormat() != matrixFormatSparseCSC) - NOT_IMPLEMENTED; - - if (!transposeA && !transposeB) - { - for(size_t j = 0; j < rhs.GetNumCols(); j++) - { - size_t start = rhs.m_compIndex[j]; //ColLocation - size_t end = rhs.m_compIndex[j+1]; - for(size_t p = start; p < end; p++) - { - size_t i = rhs.m_unCompIndex[p]; //RowLocation - ElemType val = rhs.m_pArray[p]; - - for(size_t h = 0; h < lhs.GetNumRows(); h++) - { - c(h,j) += alpha * lhs(h, i)*val; - } - } - } - } - else if (!transposeA && transposeB) - { - for(size_t j = 0; j < rhs.GetNumCols(); j++) - { - size_t start = rhs.m_compIndex[j]; - size_t end = rhs.m_compIndex[j + 1]; - - for(size_t p = start; p < end; p++) - { - size_t i = rhs.m_unCompIndex[p]; - ElemType val = rhs.m_pArray[p]; - for(size_t h = 0; h < lhs.GetNumRows(); h++) - { - c(h, i) += alpha * lhs(h, j)*val; - } - } - } - } - else if (transposeA && !transposeB) - { - NOT_IMPLEMENTED; - } - else - { - NOT_IMPLEMENTED; - } - } - - //c = alpha * op(lhs) * op(rhs) - template - void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, - const CPUSparseMatrix& rhs, const bool transposeB, CPUSparseMatrix& c) - { - if (lhs.IsEmpty() || rhs.IsEmpty()) - throw std::logic_error("LeftMultiplyAndAdd: one of the input matrix is empty."); - - int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); - int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); - int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); - int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); - - assert (m>0 && k>0 && l>0 && n>0); m; n; //converting from size_t to int may cause overflow - assert (k == l); - if (k != l) - { - throw std::invalid_argument("CPUSparseMatrix::MultiplyAndAdd: The inner dimensions of a and b must match."); - } - - c.Reset(); - - if (!transposeA && !transposeB) - { - NOT_IMPLEMENTED; - } - else if (!transposeA && transposeB) - { - if (rhs.GetFormat() != matrixFormatSparseCSC) - NOT_IMPLEMENTED; - - //allocate enough memory - c.SetFormat(matrixFormatSparseBlockCol); - c.Resize(m, n, m*min(n, rhs.m_nz)); - - map w2Id; - for(size_t j = 0; j < rhs.GetNumCols(); j++) - { // j ranges over batches - size_t start = rhs.m_compIndex[j]; - size_t end = rhs.m_compIndex[j+1]; - - for(size_t p = start; p < end; p++) - { - size_t i = rhs.m_unCompIndex[p]; //i ranges over words - ElemType val = rhs.m_pArray[p]; //1 for(i, j) - - bool first = true; - if(w2Id.find(i) == w2Id.end()) - { - w2Id[i] = w2Id.size(); - c.m_blockIds[c.m_blockSize]=i; - c.m_blockSize++; - } - else - { - first = false; - } - size_t pos = w2Id[i] * lhs.GetNumRows(); - for(size_t h = 0; h < lhs.GetNumRows(); h++) - { // h range over hidden layer - if(first == true) - { - c.m_blockVal[pos] = alpha*lhs(h, j)*val; - } else - { - c.m_blockVal[pos] += alpha*lhs(h, j)*val; - } - pos++; - } - } - } - c.m_nz = c.m_blockSize * m; - if(c.m_nz > c.GetSizeAllocated()) - { - throw std::logic_error("sparse matrix out of range."); - } - //c.SetFormat(matrixFormatSparseBlockCol); - } - else if (transposeA && !transposeB) - { - NOT_IMPLEMENTED; - } - else - { - NOT_IMPLEMENTED; - } - } - - template - void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& rhs) - { - if (lhs.IsEmpty() || rhs.IsEmpty()) - { - throw std::logic_error("ScaleAndAdd: one of the input matrix is empty."); - } - - if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols()) - { - throw std::invalid_argument("CPUSparseMatrix::ScaleAndAdd: The dimensions of a and b must match."); - } - - if(lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSC || lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSR) - { - size_t col_num = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? lhs.GetNumCols(): lhs.GetNumRows(); - for(size_t j = 0; j < col_num; j++) - { - size_t start = lhs.m_compIndex[j]; - size_t end = lhs.m_compIndex[j + 1]; - for(size_t p = start; p < end; p++) - { - size_t i = lhs.m_unCompIndex[p]; - ElemType val = lhs.m_pArray[p]; - size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j; - size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i; - rhs(r, c) += alpha * val; - } - } - } - else if (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol || lhs.m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - for(size_t j = 0; j < lhs.m_blockSize; j++) - { - size_t i = lhs.m_blockIds[j]; - size_t len = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? lhs.GetNumRows() : lhs.GetNumCols(); - size_t start = j * len; - for(size_t p = start; p < start+len; p++) - { - ElemType val = lhs.m_blockVal[p]; - - size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; - size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
i : (p - start); - rhs(r, c) += alpha * val; - } - } - } - else - { - throw std::runtime_error("CPUSparseMatrix:: ScaleAndAdd() Not implemented"); - } - } - - - // a: H x No: H is hidden layer size and No is mini-batch size - // weight: V x H, V is vocab size - // label: V x No - // cls: 2 x Nc, Nc is number of classes, each col is start and end word ids of a class - // idx2cls: V x 1, mapping from word to class id - // etp: V x No, stores predicted values - template - void CPUSparseMatrix::ClassEntropy(const CPUMatrix& a, const CPUMatrix& weight, - const CPUSparseMatrix & label, const CPUMatrix& cls, - const CPUMatrix& idx2cls, CPUSparseMatrix& etp, CPUMatrix& entropyScore) - { - if (a.IsEmpty() || cls.IsEmpty() || label.IsEmpty() || idx2cls.IsEmpty()) - throw std::logic_error("AssignSoftmaxOf: Matrix a, class, idx2cls or label is empty."); - - if(etp.GetFormat() != MatrixFormat::matrixFormatSparseCSC) - throw std::runtime_error("CPUSparseMatrix:: ClassEntropy() only support CSC"); - - size_t nC = cls.GetNumCols(); - size_t nV = label.GetNumRows() - nC; - - if (nV != idx2cls.GetNumRows() || idx2cls.GetNumCols() != 1 || cls.GetNumCols() + idx2cls.GetNumRows() != label.GetNumRows()) - throw std::logic_error("ClassEntropy: check matrix dimension"); - - //allocate enough memory - if(etp.m_elemSizeAllocated < etp.GetNumElements()) - { - etp.Resize(etp.GetNumRows(), etp.GetNumCols(), etp.GetNumElements(), true, false); - } - etp.Reset(); - - entropyScore(0, 0) = 0; - for(size_t j = 0; j < label.GetNumCols(); j++) - { - size_t start = label.m_compIndex[j]; - size_t end = label.m_compIndex[j + 1]; - for (size_t p = start; p < end; p++) - { - size_t i = label.m_unCompIndex[p]; - size_t iStt, iEnd; - if (i < nV) - { - size_t clsid = (size_t)idx2cls(i, 0); - iStt = (size_t) cls(0, clsid); //class start word id - iEnd = (size_t) cls(1, clsid); //class end word id - } - else - { - iStt = nV; - iEnd = nV + nC; - } - - size_t b = etp.m_nz; - for(size_t ii = iStt; ii < iEnd; ii++) //ii ranges over sub-vocab or class ids - { - ElemType val = 0.0; - foreach_row(rw, a) //rw ranges over hidden units - { - val += weight(ii,rw) * a(rw,j); - } - etp.SetValue(ii, j, val); - } - ElemType maxV = LZERO; - for(size_t ii = b; ii < etp.m_nz; ii++) - { - maxV = (ElemType) logadd(maxV, etp.m_pArray[ii]); - } - - for(size_t ii = b; ii < etp.m_nz; ii++) - { - etp.m_pArray[ii] = etp.m_pArray[ii] - maxV; - } - - entropyScore(0, 0) -= etp.m_pArray[b+i-iStt]; - //negate positive data points - etp.m_pArray[b+i-iStt] *=-1; - } - } - } - - - template - void CPUSparseMatrix::ClassEntropyError(CPUSparseMatrix& a) - { - for(int i = 0; i < a.m_nz; i++) - { - if(a.m_pArray[i] < 0) - { - a.m_pArray[i] = exp(a.m_pArray[i]); //negative; - } - else - { - a.m_pArray[i] = exp(-a.m_pArray[i])-1; //positive - } - } - } - - - template - void CPUSparseMatrix::ClassEntropyGradientOfInput( - const CPUSparseMatrix& error, - const CPUMatrix& weight, - CPUMatrix& grd) - { - grd.SetValue(0); - - for(size_t j = 0; j < error.GetNumCols(); j++) - { - size_t start = error.m_compIndex[j]; - size_t end = error.m_compIndex[j+1]; - for(size_t p = start; p < end; p++) - { - size_t i = error.m_unCompIndex[p]; - for(size_t h = 0; h < grd.GetNumRows(); h++) - { // h ranges over hidden units - grd(h,j) += weight(i, h) * error.m_pArray[p]; - } - } - } - } - - - - template - void CPUSparseMatrix::ClassEntropyGradientOfWeight( - const CPUSparseMatrix& error, - const CPUMatrix& input, - const CPUSparseMatrix & /*label*/, - const CPUMatrix& /*cls*/, - const 
CPUMatrix& /*idx2cls*/, - CPUSparseMatrix& grd) - { - grd.SetFormat(matrixFormatSparseBlockRow); - //allocate enough memory - grd.Resize(grd.GetNumRows(), grd.GetNumCols(), error.m_nz*input.GetNumRows(), true, false); - - grd.Reset(); - map w2Id; - for(size_t j = 0; j < error.GetNumCols(); j++) - { - size_t start = error.m_compIndex[j]; - size_t end = error.m_compIndex[j+1]; - - for(size_t p = start; p < end; p++) - { - size_t i = error.m_unCompIndex[p]; // i ranges over words - bool first = true; - if(w2Id.find(i) == w2Id.end()) - { - w2Id[i] = w2Id.size(); - grd.m_blockIds[grd.m_blockSize]=i; - grd.m_blockSize++; - } - else - { - first = false; - } - size_t pos = w2Id[i]*input.GetNumRows(); - for(size_t h = 0; h < input.GetNumRows(); h++) - { // h range over hidden layer - if(first == true) - { - grd.m_blockVal[pos] = input(h, j)*error.m_pArray[p]; - } - else - { - grd.m_blockVal[pos] += input(h, j)*error.m_pArray[p]; - } - pos++; - } - } - } - grd.m_nz = grd.m_blockSize * input.GetNumRows(); - if(grd.m_nz > grd.GetSizeAllocated()) - { - throw std::logic_error("sparse matrix out of range."); - } - //grd.SetFormat(matrixFormatSparseBlockRow); - } - - // normal update for smoothed gradients c and current gradients (this) - template - void CPUSparseMatrix::NormalGrad(CPUMatrix& c, const ElemType momentum) - { - if (c.IsEmpty()) - { - c.Resize(GetNumRows(), GetNumCols()); - c.SetValue(0.0); - } - - if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - for(size_t j = 0; j < m_blockSize; j++) - { - size_t i = m_blockIds[j]; - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols(); - size_t start = j* len; - for(size_t p = start; p < start+len; p++) - { - ElemType val = m_blockVal[p]; - size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; - size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start); - c(row, col) = (1-momentum)*val + momentum*c(row, col); - m_blockVal[p] = c(row, col); - } - } - } - else - { - throw std::runtime_error("CPUSparseMatrix:: NormalGrad() only support block sparse format"); - } - } - - // update smoothed gradients c and current gradients (this) - template - void CPUSparseMatrix::Adagrad(CPUMatrix& c) - { - if (c.IsEmpty()) - { - c.Resize(GetNumRows(), GetNumCols()); - c.SetValue(0.0); - } - - const ElemType floor = 1e-16f; - if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - size_t col_num = (m_format == MatrixFormat::matrixFormatSparseCSC) ? GetNumCols() : GetNumRows(); - for(size_t j = 0; j < col_num; j++) - { - size_t start = m_compIndex[j]; - size_t end = m_compIndex[j+1]; - for(size_t p = start; p < end; p++) - { - size_t i = m_unCompIndex[p]; - ElemType val = m_pArray[p]; - - size_t row = (m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j; - size_t col = (m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i; - ElemType adenorm = c(row, col); - adenorm += val * val; - val = val / (floor + sqrt(adenorm)); - m_pArray[p] = val; - c(row, col) = adenorm; - } - } - } else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - for(size_t j = 0; j < m_blockSize; j++) - { - size_t i = m_blockIds[j]; - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
GetNumRows() : GetNumCols(); - size_t start = j* len; - for(size_t p = start; p < start+len; p++) - { - ElemType val = m_blockVal[p]; - - size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; - size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start); - ElemType adenorm = c(row, col); - adenorm += val * val; - val = val / (floor + sqrt(adenorm)); - m_blockVal[p] = val; - c(row, col) = adenorm; - } - } - } - } - - template - CPUSparseMatrix& CPUSparseMatrix::InplaceTruncate (const ElemType threshold) - { - if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - ElemType locThresholdPos = abs(threshold); - ElemType locTHresholdNeg = -locThresholdPos; - - for(size_t j = 0; j < m_blockSize; j++) - { - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols(); - size_t start = j* len; - for (size_t p = start; p < start+len; p++) - { - if (m_blockVal[p] > locThresholdPos) - { - m_blockVal[p] = locThresholdPos; - } - else if (m_blockVal[p] < locTHresholdNeg) - { - m_blockVal[p] = locTHresholdNeg; - } - } - } - } - else - { - throw std::runtime_error("CPUSparseMatrix:: InplaceTruncate() only support block based sparse matrix"); - } - return *this; - } - - template - MATH_API File& operator>>(File& stream, CPUSparseMatrix& us) - { - stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); - size_t elsize; - stream >> elsize; - if (sizeof(ElemType) != elsize) - throw std::runtime_error("Template argument size doesn't match those in file"); - std::wstring matrixName; - - // now prepare this header to receive the data being read - size_t nz, colnum, rownum; - int format; - - // read in the header information - stream >> matrixName >> format >> nz >> colnum >> rownum; - - us.SetFormat((MatrixFormat)format); - if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR) - NOT_IMPLEMENTED; - - us.Resize(rownum, colnum, nz); - - if (nz > 0) - { - size_t compressedSize = (us.GetFormat() == matrixFormatSparseCSC) ? 
colnum + 1 : rownum + 1; - ElemType* dataBuffer = us.NzValues(); - CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation(); - CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation(); - - // read in the sparse matrix info - for (size_t i = 0; i < nz; ++i) - { - stream >> dataBuffer[i]; - } - for (size_t i = 0; i < nz; ++i) - { - stream >> unCompressedIndex[i]; - } - for (size_t i = 0; i < compressedSize; ++i) - { - stream >> compressedIndex[i]; - } - } - stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); - - us.SetMatrixName(matrixName.c_str()); - - return stream; - } - - template MATH_API File& operator>>(File& stream, CPUSparseMatrix& us); - template MATH_API File& operator>>(File& stream, CPUSparseMatrix& us); - - template - MATH_API File& operator<<(File& stream, const CPUSparseMatrix& us) - { - if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR) - NOT_IMPLEMENTED; - - stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); - stream << sizeof(ElemType); - if (us.GetMatrixName() == nullptr) - { - std::wstring s(L"nnmatrix"); - stream << s; - } - else - { - stream << us.GetMatrixName(); - } - - size_t nz, numRows, numCols; - size_t compressedSize = us.SecondaryIndexCount(); - int format = us.GetFormat(); - - stream << format << nz << numCols << numRows; - - if (nz > 0) - { - ElemType* dataBuffer = us.NzValues(); - CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation(); - CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation(); - - for (size_t i = 0; i < nz; ++i) - { - stream << dataBuffer[i]; - } - for (size_t i = 0; i < nz; ++i) - { - stream << unCompressedIndex[i]; - } - for (size_t i = 0; i < compressedSize; ++i) - { - stream << compressedIndex[i]; - } - } - stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT")); - - return stream; - } - - template class CPUSparseMatrix; - template class CPUSparseMatrix; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// Math.cpp : Defines the exported functions for the DLL application. +// + +#include "stdafx.h" +#include +#include +#include +#include +#include "CPUMatrix.h" +#include "CPUSparseMatrix.h" +#include +#include +#ifdef _WIN32 +#include +#endif +#ifdef LEAKDETECT +#include +#endif + +#include "basetypes.h" +#include "fileutil.h" + + +#ifndef USE_MKL +// use ACML as default. +// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above +// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/ +// Install the ifort64 variant (compiled with intel compiler) of the library +// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml +// to point to your folder for the include file and link library +#include // requires ACML 5.3.0 and above +#else +// requires MKL 10.0 and above +#endif + +// This is an example of an exported variable +//MATH_API int nMath=0; + +// This is an example of an exported function. 
+//MATH_API int fnMath(void)
+//{
+//    return 42;
+//}
+
+#ifndef USE_MKL  //MKL has one additional parameter for different matrix order
+#define BLAS_COLMAJOR
+#else
+#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor,
+#endif
+
+#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);}
+#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0-based indexing
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+#pragma region Helpful Enum Definitions
+    enum class MatrixOrder
+    {
+        RowMajor = 101,  // row-major arrays
+        ColMajor = 102   // column-major arrays
+    };
+
+    enum class MatrixTranspose : char
+    {
+        NoTrans = 'N',   // trans='N'
+        Trans = 'T',     // trans='T'
+        ConjTrans = 'C'  // trans='C'
+    };
+
+    enum class SymMatrixType : char
+    {
+        Up = 'U',            // symmetric matrix is stored in the upper part
+        Low = 'L',           // symmetric matrix is stored in the lower part
+        Full = 'F',          // fully populated
+        NotSymmetric = 'N'   // not a symmetric matrix
+    };
+
+    enum class MatrixOpSide : char
+    {
+        Left = 'L',   // left multiply
+        Right = 'R',  // right multiply
+    };
+#pragma endregion Helpful Enum Definitions
+
+#pragma region Constructors and Destructor
+
+    //should only be used by constructors.
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ZeroInit()
+    {
+        m_numRows = 0;
+        m_numCols = 0;
+        m_elemSizeAllocated = 0;
+        m_compIndexSize = 0;
+        m_externalBuffer = false;
+        m_computeDevice = CPUDEVICE;
+        m_nz = 0;
+        m_matrixName = NULL;
+
+        //if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+        {
+            m_colIdx = -1;
+            m_pArray = NULL;
+            m_unCompIndex = NULL;
+            m_compIndex = NULL;
+        }
+        //else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            m_blockSize = 0;
+            m_pArray = NULL;
+            m_blockIds = NULL;
+        }
+    }
+
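For reference, the CSC layout that m_pArray, m_unCompIndex, and m_compIndex implement can be pictured with a minimal standalone sketch; the arrays below hold hypothetical values and are independent of this class:

    #include <cstdio>

    // CSC view of the 3x3 matrix [ 1 0 2 ; 0 0 3 ; 4 0 0 ]:
    // pArray      = nonzero values in column-major visit order,
    // unCompIndex = row id of each stored value,
    // compIndex[c]..compIndex[c+1] = range of stored values belonging to column c.
    int main()
    {
        float pArray[]      = { 1, 4, 2, 3 };
        int   unCompIndex[] = { 0, 2, 0, 1 };
        int   compIndex[]   = { 0, 2, 2, 4 };  // column 1 is empty, so both bounds are 2

        for (int c = 0; c < 3; c++)
            for (int p = compIndex[c]; p < compIndex[c + 1]; p++)
                printf("(%d,%d) = %g\n", unCompIndex[p], c, pArray[p]);
        return 0;
    }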
+    //should only be used by constructors.
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
+    {
+        if (format != MatrixFormat::matrixFormatSparseCSC && format != MatrixFormat::matrixFormatSparseCSR && format != MatrixFormat::matrixFormatSparseBlockCol && format != MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format");
+        }
+        m_format = format;
+        m_default = defaultElem();
+        ZeroInit();
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
+    {
+        CheckInit(format);
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
+    {
+        CheckInit(format);
+        Resize(numRows, numCols, size);
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
+    {
+        if (m_matrixName != NULL)
+        {
+            delete[] m_matrixName;
+            m_matrixName = nullptr;
+        }
+        if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+        {
+            if (m_pArray != NULL)
+                delete[] m_pArray;
+            if (m_unCompIndex != NULL)
+                delete[] m_unCompIndex;
+            if (m_compIndex != NULL)
+                delete[] m_compIndex;
+        }
+        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            if (m_pArray != NULL)
+                delete[] m_pArray;
+            if (m_blockIds != NULL)
+                delete[] m_blockIds;
+        }
+    }
+
+
+
+#pragma endregion Constructors and Destructor
+
+#pragma region Basic Operators
+
+    //values must be set in column order for CSC and in row order for CSR
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, const ElemType v)
+    {
+        if (m_format != MatrixFormat::matrixFormatSparseCSC && m_format != MatrixFormat::matrixFormatSparseCSR)
+        {
+            throw std::logic_error("CPUSparseMatrix: unsupported SetValue() call.");
+        }
+
+        if (m_elemSizeAllocated < m_nz + 1)  //automatic resize
+        {
+            Resize(m_numRows, m_numCols, m_nz + 100);  //allocate 100 more elements and keep existing values
+        }
+
+        if (row >= m_numRows)
+        {
+            throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id");
+        }
+
+        if (col >= m_numCols)
+        {
+            throw std::logic_error("CPUSparseMatrix: SetValue() invalid column id");
+        }
+
+        size_t r = (m_format == matrixFormatSparseCSC) ? row : col;
+        size_t c = (m_format == matrixFormatSparseCSC) ? col : row;
+
+        m_pArray[m_nz] = v;
+        m_unCompIndex[m_nz] = (CPUSPARSE_INDEX_TYPE)r;
+
+        //consistency check
+        if (c == m_colIdx && r <= m_unCompIndex[m_nz-1])
+        {
+            throw std::logic_error("CPUSparseMatrix: SetValue is not called properly");
+        }
+
+        if (c != m_colIdx)
+        {
+            m_compIndex[c] = CPUSPARSE_INDEX_TYPE(m_nz);
+            m_colIdx = (int) c;
+        }
+        m_compIndex[c + 1] = CPUSPARSE_INDEX_TYPE(m_nz + 1);
+        m_nz++;
+    }
+
+    template<class ElemType>
+    ElemType* CPUSparseMatrix<ElemType>::BufferPointer() const
+    {
+        return m_pArray;
+    }
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
+    {
+        size_t newCompIndexSize = (numCols > numRows ?
numCols : numRows) + 1; + bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize); + + m_numRows = numRows; + m_numCols = numCols; + + if (reallocate) + { + if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) + { + ElemType *pArray = new ElemType[numNZElemToReserve]; + CPUSPARSE_INDEX_TYPE *unCompIndex = new CPUSPARSE_INDEX_TYPE[numNZElemToReserve]; + CPUSPARSE_INDEX_TYPE *compIndex = new CPUSPARSE_INDEX_TYPE[newCompIndexSize]; + + if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) + throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); + + if (keepExistingValues && m_nz > 0) + { + assert(m_compIndexSize > 0 && m_nz < numNZElemToReserve); + memcpy(pArray, m_pArray, NzSize()); + memcpy(unCompIndex, m_unCompIndex, MajorIndexSize()); + memcpy(compIndex, m_compIndex, SecondaryIndexSize()); + } + + if (m_pArray != NULL) + delete[] m_pArray; + if (m_unCompIndex != NULL) + delete[] m_unCompIndex; + if (m_compIndex != NULL) + delete[] m_compIndex; + + m_pArray = pArray; + m_unCompIndex = unCompIndex; + m_compIndex = compIndex; + } + else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) + { + ElemType *blockVal = new ElemType[numNZElemToReserve]; + size_t *blockIds = new size_t[newCompIndexSize]; + + if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) + throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); + + if (keepExistingValues && m_elemSizeAllocated > 0) + { + assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve); + memcpy(blockVal, m_pArray, NzSize()); + memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize); + } + + if (m_pArray != NULL) + delete[] m_pArray; + if(m_blockIds != NULL) + delete[] m_blockIds; + + m_pArray = blockVal; + m_blockIds = blockIds; + } + + m_elemSizeAllocated = numNZElemToReserve; + m_compIndexSize = newCompIndexSize; + } + } + + //Reset matrix so it can be reused + template + void CPUSparseMatrix::Reset() + { + m_nz = 0; + m_colIdx = -1; + m_blockSize = 0; + } + + //c = alpha*op(lhs) * op(rhs) + beta*c + template + void CPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, + const CPUSparseMatrix& rhs, const bool transposeB, ElemType beta, CPUMatrix& c) + + { + if (lhs.IsEmpty() || rhs.IsEmpty()) + throw std::logic_error("MultiplyAndWeightedAdd: one of the input matrix is empty."); + + int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); + int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); + int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); + int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); + + assert (m>0 && k>0 && l>0 && n>0); //converting from size_t to int may cause overflow + assert (k == l); + if (k != l) + { + throw std::invalid_argument("CPUSparseMatrix::MultiplyAndWeightedAdd: The inner dimensions of a and b must match."); + } + + if (c.GetNumRows() != m || c.GetNumCols() != n) + { + c.Resize(m,n); + } + + if (beta == 0) + { + memset(c.GetArray(), 0, sizeof(ElemType) * c.GetNumElements()); + } + else if (beta != 1) + { +#pragma omp parallel for + foreach_coord(i,j,c) + { + c(i,j) = beta * c(i,j); + } + } + + if (rhs.GetFormat() != matrixFormatSparseCSC) + NOT_IMPLEMENTED; + + if (!transposeA && !transposeB) + { + for(size_t j = 0; j < rhs.GetNumCols(); j++) + { + size_t start = rhs.m_compIndex[j]; //ColLocation + size_t end = rhs.m_compIndex[j+1]; + for(size_t p = start; p < end; p++) + { + size_t i = rhs.m_unCompIndex[p]; //RowLocation + ElemType val = rhs.m_pArray[p]; + + for(size_t h = 0; h < lhs.GetNumRows(); h++) + { + c(h,j) += alpha * lhs(h, i)*val; + } + } + } + } + else if (!transposeA && transposeB) + { + for(size_t j = 0; j < rhs.GetNumCols(); j++) + { + size_t start = rhs.m_compIndex[j]; + size_t end = rhs.m_compIndex[j + 1]; + + for(size_t p = start; p < end; p++) + { + size_t i = rhs.m_unCompIndex[p]; + ElemType val = rhs.m_pArray[p]; + for(size_t h = 0; h < lhs.GetNumRows(); h++) + { + c(h, i) += alpha * lhs(h, j)*val; + } + } + } + } + else if (transposeA && !transposeB) + { + NOT_IMPLEMENTED; + } + else + { + NOT_IMPLEMENTED; + } + } + + //c = alpha * op(lhs) * op(rhs) + template + void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, + const CPUSparseMatrix& rhs, const bool transposeB, CPUSparseMatrix& c) + { + if (lhs.IsEmpty() || rhs.IsEmpty()) + throw std::logic_error("LeftMultiplyAndAdd: one of the input matrix is empty."); + + int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); + int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); + int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); + int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); + + assert (m>0 && k>0 && l>0 && n>0); m; n; //converting from size_t to int may cause overflow + assert (k == l); + if (k != l) + { + throw std::invalid_argument("CPUSparseMatrix::MultiplyAndAdd: The inner dimensions of a and b must match."); + } + + c.Reset(); + + if (!transposeA && !transposeB) + { + NOT_IMPLEMENTED; + } + else if (!transposeA && transposeB) + { + if (rhs.GetFormat() != matrixFormatSparseCSC) + NOT_IMPLEMENTED; + + //allocate enough memory + c.SetFormat(matrixFormatSparseBlockCol); + c.Resize(m, n, m*min(n, rhs.m_nz)); + + map w2Id; + for(size_t j = 0; j < rhs.GetNumCols(); j++) + { // j ranges over batches + size_t start = rhs.m_compIndex[j]; + size_t end = rhs.m_compIndex[j+1]; + + for(size_t p = start; p < end; p++) + { + size_t i = rhs.m_unCompIndex[p]; //i ranges over words + ElemType val = rhs.m_pArray[p]; //1 for(i, j) + + bool first = true; + if(w2Id.find(i) == w2Id.end()) + { + w2Id[i] = w2Id.size(); + c.m_blockIds[c.m_blockSize]=i; + c.m_blockSize++; + } + else + { + first = false; + } + size_t pos = w2Id[i] * lhs.GetNumRows(); + for(size_t h = 0; h < lhs.GetNumRows(); h++) + { // h range over hidden layer + if(first == true) + { + c.m_pArray[pos] = alpha*lhs(h, j)*val; + } else + { + c.m_pArray[pos] += alpha*lhs(h, j)*val; + } + pos++; + } + } + } + c.m_nz = c.m_blockSize * m; + if(c.m_nz > c.GetSizeAllocated()) + { + throw std::logic_error("sparse matrix out of range."); + } + //c.SetFormat(matrixFormatSparseBlockCol); + } + else if (transposeA && !transposeB) + { + NOT_IMPLEMENTED; + } + else + { + NOT_IMPLEMENTED; + } + } + + template + void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& rhs) + { + if (lhs.IsEmpty() || rhs.IsEmpty()) + { + throw std::logic_error("ScaleAndAdd: one of the input matrix is empty."); + } + + if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols()) + { + throw std::invalid_argument("CPUSparseMatrix::ScaleAndAdd: The dimensions of a and b must match."); + } + + if(lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSC || lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSR) + { + size_t col_num = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? lhs.GetNumCols(): lhs.GetNumRows(); + for(size_t j = 0; j < col_num; j++) + { + size_t start = lhs.m_compIndex[j]; + size_t end = lhs.m_compIndex[j + 1]; + for(size_t p = start; p < end; p++) + { + size_t i = lhs.m_unCompIndex[p]; + ElemType val = lhs.m_pArray[p]; + size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j; + size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i; + rhs(r, c) += alpha * val; + } + } + } + else if (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol || lhs.m_format == MatrixFormat::matrixFormatSparseBlockRow) + { + for(size_t j = 0; j < lhs.m_blockSize; j++) + { + size_t i = lhs.m_blockIds[j]; + size_t len = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? lhs.GetNumRows() : lhs.GetNumCols(); + size_t start = j * len; + for(size_t p = start; p < start+len; p++) + { + ElemType val = lhs.m_pArray[p]; + + size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; + size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
i : (p - start);
+                    rhs(r, c) += alpha * val;
+                }
+            }
+        }
+        else
+        {
+            throw std::runtime_error("CPUSparseMatrix::ScaleAndAdd() not implemented for this format");
+        }
+    }
+
+
+    template<class ElemType>
+    bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            throw std::logic_error("AreEqual: one of the input matrices is empty.");
+
+        if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
+            return false;
+
+        bool result = true;
+
+        //not parallelized with OpenMP: break is not permitted inside an OpenMP for loop
+        foreach_coord(i, j, a)
+        {
+            if (abs(a(i, j) - b(i, j)) > threshold)
+            {
+                result = false;
+                break;  //foreach_coord expands to nested loops, so this exits only the inner loop; result stays false regardless
+            }
+        }
+
+        return result;
+    }
+
+    // a: H x No: H is the hidden layer size and No is the mini-batch size
+    // weight: V x H, V is the vocab size
+    // label: V x No
+    // cls: 2 x Nc, Nc is the number of classes; each col holds the start and end word ids of a class
+    // idx2cls: V x 1, mapping from word to class id
+    // etp: V x No, stores predicted values
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropy(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& weight,
+        const CPUSparseMatrix<ElemType>& label, const CPUMatrix<ElemType>& cls,
+        const CPUMatrix<ElemType>& idx2cls, CPUSparseMatrix<ElemType>& etp, CPUMatrix<ElemType>& entropyScore)
+    {
+        if (a.IsEmpty() || cls.IsEmpty() || label.IsEmpty() || idx2cls.IsEmpty())
+            throw std::logic_error("ClassEntropy: Matrix a, cls, idx2cls or label is empty.");
+
+        if (etp.GetFormat() != MatrixFormat::matrixFormatSparseCSC)
+            throw std::runtime_error("CPUSparseMatrix::ClassEntropy() only supports CSC");
+
+        size_t nC = cls.GetNumCols();
+        size_t nV = label.GetNumRows() - nC;
+
+        if (nV != idx2cls.GetNumRows() || idx2cls.GetNumCols() != 1 || cls.GetNumCols() + idx2cls.GetNumRows() != label.GetNumRows())
+            throw std::logic_error("ClassEntropy: check matrix dimension");
+
+        //allocate enough memory
+        if (etp.m_elemSizeAllocated < etp.GetNumElements())
+        {
+            etp.Resize(etp.GetNumRows(), etp.GetNumCols(), etp.GetNumElements(), true, false);
+        }
+        etp.Reset();
+
+        entropyScore(0, 0) = 0;
+        for (size_t j = 0; j < label.GetNumCols(); j++)
+        {
+            size_t start = label.m_compIndex[j];
+            size_t end = label.m_compIndex[j + 1];
+            for (size_t p = start; p < end; p++)
+            {
+                size_t i = label.m_unCompIndex[p];
+                size_t iStt, iEnd;
+                if (i < nV)
+                {
+                    size_t clsid = (size_t)idx2cls(i, 0);
+                    iStt = (size_t) cls(0, clsid);  //class start word id
+                    iEnd = (size_t) cls(1, clsid);  //class end word id
+                }
+                else
+                {
+                    iStt = nV;
+                    iEnd = nV + nC;
+                }
+
+                size_t b = etp.m_nz;
+                for (size_t ii = iStt; ii < iEnd; ii++)  //ii ranges over the sub-vocab or class ids
+                {
+                    ElemType val = 0.0;
+                    foreach_row(rw, a)  //rw ranges over hidden units
+                    {
+                        val += weight(ii, rw) * a(rw, j);
+                    }
+                    etp.SetValue(ii, j, val);
+                }
+                ElemType maxV = LZERO;
+                for (size_t ii = b; ii < etp.m_nz; ii++)
+                {
+                    maxV = (ElemType) logadd(maxV, etp.m_pArray[ii]);
+                }
+
+                for (size_t ii = b; ii < etp.m_nz; ii++)
+                {
+                    etp.m_pArray[ii] = etp.m_pArray[ii] - maxV;
+                }
+
+                entropyScore(0, 0) -= etp.m_pArray[b+i-iStt];
+                //negate positive data points
+                etp.m_pArray[b+i-iStt] *= -1;
+            }
+        }
+    }
+
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropyError(CPUSparseMatrix<ElemType>& a)
+    {
+        for (size_t i = 0; i < a.m_nz; i++)
+        {
+            if (a.m_pArray[i] < 0)
+            {
+                a.m_pArray[i] = exp(a.m_pArray[i]);  //negative
+            }
+            else
+            {
+                a.m_pArray[i] = exp(-a.m_pArray[i]) - 1;  //positive
+            }
+        }
+    }
+
+
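ClassEntropy above normalizes each class or sub-vocabulary block with a running log-sum-exp accumulated through logadd. Assuming logadd has the usual log(exp(x) + exp(y)) meaning, a minimal standalone sketch of that accumulation (logAdd here is a local stand-in, not this library's function):

    #include <cmath>
    #include <cstdio>

    // numerically stable logAdd(x, y) = log(exp(x) + exp(y))
    double logAdd(double x, double y)
    {
        if (x < y) { double t = x; x = y; y = t; }  // keep x = max(x, y)
        return x + log1p(exp(y - x));
    }

    int main()
    {
        const double scores[] = { -1.0, 0.5, 2.0 };
        double acc = -1e30;  // plays the role of LZERO, an effective -infinity
        for (double s : scores)
            acc = logAdd(acc, s);
        printf("log-sum-exp = %f\n", acc);  // log(exp(-1) + exp(0.5) + exp(2))
        return 0;
    }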
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropyGradientOfInput(
+        const CPUSparseMatrix<ElemType>& error,
+        const CPUMatrix<ElemType>& weight,
+        CPUMatrix<ElemType>& grd)
+    {
+        grd.SetValue(0);
+
+        for (size_t j = 0; j < error.GetNumCols(); j++)
+        {
+            size_t start = error.m_compIndex[j];
+            size_t end = error.m_compIndex[j+1];
+            for (size_t p = start; p < end; p++)
+            {
+                size_t i = error.m_unCompIndex[p];
+                for (size_t h = 0; h < grd.GetNumRows(); h++)
+                {   //h ranges over hidden units
+                    grd(h, j) += weight(i, h) * error.m_pArray[p];
+                }
+            }
+        }
+    }
+
+
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropyGradientOfWeight(
+        const CPUSparseMatrix<ElemType>& error,
+        const CPUMatrix<ElemType>& input,
+        const CPUSparseMatrix<ElemType>& /*label*/,
+        const CPUMatrix<ElemType>& /*cls*/,
+        const CPUMatrix<ElemType>& /*idx2cls*/,
+        CPUSparseMatrix<ElemType>& grd)
+    {
+        grd.SetFormat(matrixFormatSparseBlockRow);
+        //allocate enough memory
+        grd.Resize(grd.GetNumRows(), grd.GetNumCols(), error.m_nz*input.GetNumRows(), true, false);
+
+        grd.Reset();
+        map<size_t, size_t> w2Id;
+        for (size_t j = 0; j < error.GetNumCols(); j++)
+        {
+            size_t start = error.m_compIndex[j];
+            size_t end = error.m_compIndex[j+1];
+
+            for (size_t p = start; p < end; p++)
+            {
+                size_t i = error.m_unCompIndex[p];  //i ranges over words
+                bool first = true;
+                if (w2Id.find(i) == w2Id.end())
+                {
+                    w2Id[i] = w2Id.size();
+                    grd.m_blockIds[grd.m_blockSize] = i;
+                    grd.m_blockSize++;
+                }
+                else
+                {
+                    first = false;
+                }
+                size_t pos = w2Id[i]*input.GetNumRows();
+                for (size_t h = 0; h < input.GetNumRows(); h++)
+                {   //h ranges over the hidden layer
+                    if (first == true)
+                    {
+                        grd.m_pArray[pos] = input(h, j)*error.m_pArray[p];
+                    }
+                    else
+                    {
+                        grd.m_pArray[pos] += input(h, j)*error.m_pArray[p];
+                    }
+                    pos++;
+                }
+            }
+        }
+        grd.m_nz = grd.m_blockSize * input.GetNumRows();
+        if (grd.m_nz > grd.GetSizeAllocated())
+        {
+            throw std::logic_error("sparse matrix out of range.");
+        }
+        //grd.SetFormat(matrixFormatSparseBlockRow);
+    }
+
+    // momentum update of the smoothed gradients c from the current gradients (this)
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum)
+    {
+        if (c.IsEmpty())
+        {
+            c.Resize(GetNumRows(), GetNumCols());
+            c.SetValue(0.0);
+        }
+
+        if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                size_t i = m_blockIds[j];
+                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
+                size_t start = j * len;
+                for (size_t p = start; p < start+len; p++)
+                {
+                    ElemType val = m_pArray[p];
+                    size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
+                    size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
+                    c(row, col) = (1-momentum)*val + momentum*c(row, col);
+                    m_pArray[p] = c(row, col);
+                }
+            }
+        }
+        else
+        {
+            throw std::runtime_error("CPUSparseMatrix::NormalGrad() only supports the block sparse formats");
+        }
+    }
+
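Per stored element, NormalGrad applies standard momentum smoothing and writes the smoothed value back into the sparse gradient. A minimal numeric sketch with hypothetical values:

    #include <cstdio>

    int main()
    {
        float momentum = 0.9f;
        float c = 0.2f;     // smoothed gradient so far
        float grad = 1.0f;  // current gradient value from the sparse block
        c = (1 - momentum) * grad + momentum * c;  // same formula as NormalGrad
        printf("smoothed = %g\n", c);  // prints 0.28
        return 0;
    }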
+    // update the smoothed gradients c and the current gradients (this)
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c)
+    {
+        if (c.IsEmpty())
+        {
+            c.Resize(GetNumRows(), GetNumCols());
+            c.SetValue(0.0);
+        }
+
+        const ElemType floor = 1e-16f;
+        if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+        {
+            size_t col_num = (m_format == MatrixFormat::matrixFormatSparseCSC) ? GetNumCols() : GetNumRows();
+            for (size_t j = 0; j < col_num; j++)
+            {
+                size_t start = m_compIndex[j];
+                size_t end = m_compIndex[j+1];
+                for (size_t p = start; p < end; p++)
+                {
+                    size_t i = m_unCompIndex[p];
+                    ElemType val = m_pArray[p];
+
+                    size_t row = (m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j;
+                    size_t col = (m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i;
+                    ElemType adenorm = c(row, col);
+                    adenorm += val * val;
+                    val = val / (floor + sqrt(adenorm));
+                    m_pArray[p] = val;
+                    c(row, col) = adenorm;
+                }
+            }
+        }
+        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                size_t i = m_blockIds[j];
+                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
+                size_t start = j * len;
+                for (size_t p = start; p < start+len; p++)
+                {
+                    ElemType val = m_pArray[p];
+
+                    size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
+                    size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
+                    ElemType adenorm = c(row, col);
+                    adenorm += val * val;
+                    val = val / (floor + sqrt(adenorm));
+                    m_pArray[p] = val;
+                    c(row, col) = adenorm;
+                }
+            }
+        }
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceTruncate(const ElemType threshold)
+    {
+        if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            ElemType locThresholdPos = abs(threshold);
+            ElemType locThresholdNeg = -locThresholdPos;
+
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
+                size_t start = j * len;
+                for (size_t p = start; p < start+len; p++)
+                {
+                    if (m_pArray[p] > locThresholdPos)
+                    {
+                        m_pArray[p] = locThresholdPos;
+                    }
+                    else if (m_pArray[p] < locThresholdNeg)
+                    {
+                        m_pArray[p] = locThresholdNeg;
+                    }
+                }
+            }
+        }
+        else
+        {
+            throw std::runtime_error("CPUSparseMatrix::InplaceTruncate() only supports block-based sparse matrices");
+        }
+        return *this;
+    }
+
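The stream operators that follow use a simple sectioned record. Reading operator>> below together with operator<< further down, the on-disk layout appears to be:

    BMAT marker
    sizeof(ElemType)                           element-size guard
    matrix name                                wstring
    format, nz, numCols, numRows               header fields
    nz element values                          NzValues()
    nz major indices (row ids for CSC)         MajorIndexLocation()
    numCols+1 (CSC) or numRows+1 (CSR) ids     SecondaryIndexLocation()
    EMAT marker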
+    template<class ElemType>
+    MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
+    {
+        stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
+        size_t elsize;
+        stream >> elsize;
+        if (sizeof(ElemType) != elsize)
+            throw std::runtime_error("Template argument size doesn't match those in file");
+        std::wstring matrixName;
+
+        // now prepare this header to receive the data being read
+        size_t nz, colnum, rownum;
+        int format;
+
+        // read in the header information
+        stream >> matrixName >> format >> nz >> colnum >> rownum;
+
+        us.SetFormat((MatrixFormat)format);
+        if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
+            NOT_IMPLEMENTED;
+
+        us.Resize(rownum, colnum, nz);
+
+        if (nz > 0)
+        {
+            size_t compressedSize = (us.GetFormat() == matrixFormatSparseCSC) ? colnum + 1 : rownum + 1;
+            ElemType* dataBuffer = us.NzValues();
+            CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
+            CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
+
+            // read in the sparse matrix info
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream >> dataBuffer[i];
+            }
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream >> unCompressedIndex[i];
+            }
+            for (size_t i = 0; i < compressedSize; ++i)
+            {
+                stream >> compressedIndex[i];
+            }
+        }
+        stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
+
+        us.SetMatrixName(matrixName.c_str());
+
+        return stream;
+    }
+
+    template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
+    template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
+
+    template<class ElemType>
+    MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
+    {
+        if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
+            NOT_IMPLEMENTED;
+
+        stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
+        stream << sizeof(ElemType);
+        if (us.GetMatrixName() == nullptr)
+        {
+            std::wstring s(L"nnmatrix");
+            stream << s;
+        }
+        else
+        {
+            stream << us.GetMatrixName();
+        }
+
+        // header fields: nonzero count and actual dimensions
+        size_t nz = us.NzSize() / sizeof(ElemType);
+        size_t numRows = us.GetNumRows();
+        size_t numCols = us.GetNumCols();
+        size_t compressedSize = us.SecondaryIndexCount();
+        int format = us.GetFormat();
+
+        stream << format << nz << numCols << numRows;
+
+        if (nz > 0)
+        {
+            ElemType* dataBuffer = us.NzValues();
+            CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
+            CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
+
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream << dataBuffer[i];
+            }
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream << unCompressedIndex[i];
+            }
+            for (size_t i = 0; i < compressedSize; ++i)
+            {
+                stream << compressedIndex[i];
+            }
+        }
+        stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
+
+        return stream;
+    }
+
+    template class CPUSparseMatrix<float>;
+    template class CPUSparseMatrix<double>;
+
+}}}
diff --git a/Math/Math/CPUSparseMatrix.h b/Math/Math/CPUSparseMatrix.h
index 43885f928..aac886ec1 100644
--- a/Math/Math/CPUSparseMatrix.h
+++ b/Math/Math/CPUSparseMatrix.h
@@ -33,7 +33,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     public:
         CPUSparseMatrix(const MatrixFormat format);
         CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
-
+
+        ~CPUSparseMatrix();
 
     public:
@@ -76,6 +77,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
 
+        static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
+
         /// sum(vec(a).*vec(b))
         static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/) { NOT_IMPLEMENTED; }
@@ -89,6 +92,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
         void Reset();
 
+        //zero-initialized element returned for positions that are not stored explicitly
+        inline ElemType defaultElem()
+        {
+            ElemType defaultVal;
+            memset(&defaultVal, 0, sizeof(ElemType));
+            return defaultVal;
+        }
+
+        const ElemType& operator() (const size_t row, const size_t col) const
+        {
+            if (col >= m_numCols || row >= m_numRows)
+            {
+                throw std::runtime_error("Position outside matrix dimensions");
+            }
+
+            if (m_format == MatrixFormat::matrixFormatSparseCSC)
+            {
+                size_t start = m_compIndex[col];
+                size_t end = m_compIndex[col + 1];
+                for (size_t p = start; p < end; p++)
+                {
+                    size_t i =
m_unCompIndex[p]; + if (i == row) + { + return m_pArray[p]; + } + } + + return m_default; + } + else + { + NOT_IMPLEMENTED; + } + } + public: void NormalGrad(CPUMatrix& c, const ElemType momentum); void Adagrad(CPUMatrix& c); @@ -103,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: const ElemType* NzValues() const { return m_pArray; } - ElemType* NzValues() { return m_pArray; } + inline ElemType* NzValues() { return m_pArray; } size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format @@ -139,9 +177,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { CPUSPARSE_INDEX_TYPE *m_unCompIndex; //row/col ids in CSC/CSR format CPUSPARSE_INDEX_TYPE *m_compIndex; //begin ids of col/row in CSC/CSR format - size_t m_blockSize; //block size - ElemType *m_blockVal; //block values + size_t m_blockSize; //block size size_t *m_blockIds; //block ids + + ElemType m_default; }; typedef CPUSparseMatrix CPUSingleSparseMatrix; diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 0f2359078..bd6089240 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1,3479 +1,3440 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - -#include "stdafx.h" -#include "BestGpu.h" - -#ifndef CPUONLY - -#include "cublas_v2.h" -#include -#include -#include -#include -#include -#include "device_launch_parameters.h" -#include "GPUMatrix.h" -#include "GPUMatrixCUDAKernels.cu" -#include "GPUSparseMatrix.h" -#include // for cout - -#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs -#pragma comment (lib, "cublas.lib") -#pragma comment (lib, "cusparse.lib") -#pragma comment (lib, "curand.lib") - -#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<>> syntax if a and b are size_t -#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this -#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons - -#ifdef NO_SYNC -bool do_sync = false; -#else -bool do_sync = true; -#endif - -#ifdef _WIN32 -// thread local storage to access the current stream, initalize to default stream -__declspec (thread) -#endif -cudaStream_t t_stream = cudaStreamDefault; - -extern int _ConvertSMVer2Cores(int major, int minor); // forward declaration - -// SetStream - set the stream that will be used by the GPU routines -void MATH_API SetStream(cudaStream_t stream) -{ - t_stream = stream; -} - -// GetStream - get the stream that will be used by the GPU routines -cudaStream_t MATH_API GetStream() -{ - return t_stream; -} - - -void CURAND_CALL(curandStatus x) -{ - if(x!=CURAND_STATUS_SUCCESS) - { - throw std::runtime_error("CURAND fail"); - } -} - -void CUBLAS_CALL(cublasStatus_t x) -{ - if(x!=CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("CUBLAS fail"); - } -} - -void CUDA_CALL(cudaError_t x) -{ - if(x!=cudaSuccess) - { - const char* errmsg = cudaGetErrorString(x); - std::cerr << "!!!!!!!!CUDA EXCEPTION: " << errmsg << std::endl; - cudaDeviceSynchronize(); - throw std::runtime_error(errmsg); - } -} - -namespace Microsoft { namespace MSR { namespace CNTK { - - // PrepareDevice - Setup the correct cuda context for an operation - // deviceId - the device on which the operation will take place - void PrepareDevice(DEVICEID_TYPE deviceId) - { - 
static DEVICEID_TYPE currentDevice = AUTOPLACEMATRIX; // set to anything valid - // externally managed matrices are guaranteed to be on the right device - if (deviceId == MANAGEDEXTERN) - return; - // and if we last set the device to be this device we are good - if (deviceId == currentDevice) - return; - CUDA_CALL(cudaSetDevice(deviceId)); - currentDevice=deviceId; - } - -#pragma region DeviceBoundNumber class - - template - DeviceBoundNumber::DeviceBoundNumber(const DeviceBoundNumber &/*deepCopy*/) - { - NOT_IMPLEMENTED; - } - - template - DeviceBoundNumber::DeviceBoundNumber(DeviceBoundNumber &&shallowCopy) - { - ShallowCopyFrom(shallowCopy.m_data,shallowCopy.m_computeDevice); - shallowCopy.m_data=NULL; - } - - template - void DeviceBoundNumber::ShallowCopyFrom(ElemType* newVal,int newValsDevceId) - { - m_computeDevice = newValsDevceId; - m_data = newVal; - } - - template - DeviceBoundNumber::~DeviceBoundNumber() - { - if (m_data!=NULL) - { - if (m_computeDevice<0) - { - delete m_data; - m_data = NULL; - } - else if (m_computeDevice != MANAGEDEXTERN) - CUDA_CALL(cudaFree(m_data)); - } - } - -#pragma endregion DeviceBoundNumber class - -#pragma region Helper functions - template - cublasHandle_t _initCUBLAS(int devId) - { - PrepareDevice((DEVICEID_TYPE)devId); - cublasHandle_t cuHandle; - CUBLAS_CALL(cublasCreate(&cuHandle)); - return cuHandle; - } - - // GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information - // TODO: should be replaced by BestGpu class instead, it's much better - template - DEVICEID_TYPE GPUMatrix::GetBestGPUDeviceId() //returns -1 if no GPUs can be used - { - // currently there is little point in giving out different device IDs each time ask for a matrix, - // we really want them all on the same device eventually - static int chosenDeviceId = AUTOPLACEMATRIX; - if (chosenDeviceId != AUTOPLACEMATRIX) - return chosenDeviceId; - - __try - { - // stash previous device state - // if there was one on entry: - int nPrevDev = -1; - cudaError_t ePrevDev = cudaGetDevice(&nPrevDev); - - int deviceCount = -1; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - if (error_id != cudaSuccess || deviceCount==0) - { - return -1; - } - - int setDev = -1; - int curDev=0; - long curPower = 0; - for (DEVICEID_TYPE dev = 0; dev < deviceCount; ++dev) - { - CUDA_CALL(cudaSetDevice(dev)); - setDev = dev; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - long power = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; - //long power = _GetFreeMemoryOnCUDADevice(dev); - if (power>curPower) - { - curPower=power; - curDev = dev; - } - } - - if(nPrevDev >= 0 && ePrevDev == cudaSuccess && - setDev >= 0 && setDev != nPrevDev) { - // restore current context to the one we entered with - // if there was one the caller might want unchanged. - cudaSetDevice(nPrevDev); - } - chosenDeviceId = curDev; - return curDev; - } - __except (1) - { - return -1; // CPU - } - } - - // PrepareDevice - Setup the correct cuda context for an operation - // deviceId - the device on which the operation will take place - // defaults to -1, which means use matrices current device - template - DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const - { - // if default value use current compute device - DEVICEID_TYPE newId = deviceId >= 0 ? 
deviceId : m_computeDevice; - - Microsoft::MSR::CNTK::PrepareDevice(newId); - return newId; - } - - template - ElemType* GPUMatrix::CopyToArray() const - { - size_t numElements = GetNumElements(); - if (numElements != 0) - { - PrepareDevice(); - ElemType* pArray = new ElemType[numElements]; - CUDA_CALL(cudaMemcpy(pArray,m_pArray,sizeof(ElemType)*m_numRows*m_numCols,cudaMemcpyDeviceToHost)); - return pArray; - } - else - { - return NULL; - } - } - - //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done - //return number of elements copied - template - size_t GPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const - { - size_t numElements = GetNumElements(); - - if (numElements > currentArraySize) - { - delete arrayCopyTo; - arrayCopyTo = new ElemType[numElements]; - currentArraySize = numElements; - } - - if (numElements != 0) - { - PrepareDevice(); - CUDA_CALL(cudaMemcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements, cudaMemcpyDeviceToHost)); - } - - return numElements; - } - - template - void GPUMatrix::ChangeDeviceTo(DEVICEID_TYPE to_id) - { - if (!OwnBuffer()) - throw std::logic_error("Cannot change device on Managed external matrix"); - if (to_id == CPUDEVICE) - throw std::logic_error("to_id must be valid GPU"); - if (m_computeDevice==to_id) - return; - - PrepareDevice((DEVICEID_TYPE)to_id); - ElemType* d_dst=NULL; - CUDA_CALL(cudaMalloc((void**)&d_dst,sizeof(ElemType)*m_numRows*m_numCols)); - - m_elemSizeAllocated = m_numRows*m_numCols; - - // check to make sure we have something to copy (on init we often have zero sized allocations) - if (m_elemSizeAllocated > 0) - { - // first try peer access - int canAccessPeer = false; - CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice)); - if (canAccessPeer) - { - CUDA_CALL(cudaDeviceEnablePeerAccess(m_computeDevice, 0)); - CUDA_CALL(cudaMemcpyPeer(d_dst,to_id,m_pArray,m_computeDevice,sizeof(ElemType)*m_numRows*m_numCols)); - } - else - { - // peer access didn't work, just copy normal - // make this more efficient by keeping some buffers available for each copy - ElemType* h_dst=NULL; - PrepareDevice(); - CUDA_CALL(cudaMallocHost((void**)&h_dst,sizeof(ElemType)*m_numRows*m_numCols)); - CUDA_CALL(cudaMemcpy(h_dst,m_pArray,sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyDeviceToHost)); - PrepareDevice((DEVICEID_TYPE)to_id); - CUDA_CALL(cudaMemcpy(d_dst,h_dst,sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyHostToDevice)); - CUDA_CALL(cudaFreeHost(h_dst)); - } - } - PrepareDevice(); - CUDA_CALL(cudaFree(m_pArray)); - m_pArray=d_dst; - - PrepareDevice((DEVICEID_TYPE)to_id); - m_computeDevice=to_id; - } - - template - void GPUMatrix::performInplaceFunction(int kind) - { - PrepareDevice(); - LONG64 N= (LONG64) GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - switch (kind) - { - case 0: - _inplaceSigmoidOnCuda<<>>(m_pArray, N); - break; - case 1: - _inplaceTanhOnCuda<<>>(m_pArray, N); - break; - case 2: - _inplaceSqrtOnCuda<<>>(m_pArray, N); - break; - case 3: - _inplaceExpOnCuda<<>>(m_pArray,N); - break; - case 4: - _inplaceLogOnCuda<<>>(m_pArray,N); - break; - case 5: - _inplaceAbsOnCuda<<>>(m_pArray,N); - break; - case 6: - _inplaceLinRectDerivative<<>>(m_pArray,N); - break; - case 7: - _inplaceCosineOnCuda<<>>(m_pArray,N); - break; - case 8: - _inplaceNegativeSineOnCuda<<>>(m_pArray,N); - break; - } - if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - -#pragma endregion Helper functions - -#pragma region Constructors and Destructor - - //should only be used by constructors. - template - void GPUMatrix::ZeroInit(int deviceId) - { - m_computeDevice = deviceId; - m_pArray = nullptr; - m_numRows = 0; - m_numCols = 0; - m_elemSizeAllocated = 0; - m_matrixName=NULL; - m_format = matrixFormatDense; - m_externalBuffer = false; - } - - template - GPUMatrix::GPUMatrix(int deviceId) - { - if (deviceId == MANAGEDEXTERN) - throw std::logic_error("Basic constructor cannot be used with Managed Extern types"); - - ZeroInit(deviceId); - }; - - //matrixName is used to verify that correct matrix is read. - template - GPUMatrix::GPUMatrix(FILE* f, const char * matrixName, int deviceId) - { - if (deviceId == MANAGEDEXTERN) - throw std::logic_error("File constructor cannot be used with Managed Extern types"); - - ReadFromFile(f, matrixName); - } - - template - GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols,int deviceId) - { - if (deviceId == MANAGEDEXTERN) - throw std::logic_error("constructor cannot be used with Managed Extern types"); - ZeroInit(deviceId); - m_numRows = numRows; - m_numCols = numCols; - m_elemSizeAllocated = GetNumElements(); - - if (m_elemSizeAllocated != 0) - { - PrepareDevice(); - CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); - CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); - } - }; - - template - GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags, int deviceId) - { - ZeroInit(deviceId); - SetValue(numRows, numCols, pArray, matrixFlags, deviceId); - }; - - template - GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) - { - ZeroInit(deepCopyFrom.m_computeDevice); - SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); - } - - template - GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) - { - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_computeDevice = moveFrom.m_computeDevice; - m_pArray = moveFrom.m_pArray; //shallow copy the pointer - m_matrixName=moveFrom.m_matrixName; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; - m_format = moveFrom.m_format; - m_externalBuffer = moveFrom.m_externalBuffer; - - //release the pointer from the source object so that the destructor won't release it twice - moveFrom.ZeroInit(0); - } - - //assignment operator, deep copy - template - GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) - { - if (this != &deepCopyFrom) - { - SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); - } - return *this; - } - - //move assignment operator, shallow copy - template - GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) - { - if (this != &moveFrom) - { - if (OwnBuffer() && m_pArray!=NULL) - { - CUDA_CALL(cudaFree(m_pArray)); - } - - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; - m_pArray = moveFrom.m_pArray; - m_computeDevice = moveFrom.m_computeDevice; - m_format = moveFrom.m_format; - m_externalBuffer = moveFrom.m_externalBuffer; - - //release the pointer from the source object so that the destructor won't release it twice - moveFrom.ZeroInit(0); - } - return *this; - } - - template - GPUMatrix::~GPUMatrix(void) - { - Clear(); - } - - template - void GPUMatrix::Clear() - { - if (OwnBuffer() && 
m_pArray!=NULL) - { - if (m_computeDevice>=0) - { - PrepareDevice(); - cudaFree(m_pArray); - m_pArray = NULL; - m_elemSizeAllocated = 0; - } - } - BaseMatrix::Clear(); - - ZeroInit(m_computeDevice); - } -#pragma endregion Constructors and Destructor - - template - int GPUMatrix::GetComputeDeviceId() const - { - // for externally managed memory the CUDA context will have the current device - if (m_computeDevice == MANAGEDEXTERN) - { - int devId; - assert(m_externalBuffer); - CUDA_CALL(cudaGetDevice(&devId)); - return devId; - } - return m_computeDevice; - } - -#pragma region Basic Operators - template - GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const - { - if (numCols == 0) - throw std::logic_error("The slice cannot have 0 columns."); - - if (startColumn + numCols > m_numCols) - throw std::logic_error("The slice is out of range of the source matrix."); - - GPUMatrix slice(m_numRows, numCols, m_pArray + startColumn * m_numRows, matrixFlagDontOwnBuffer, m_computeDevice); - - return slice; - } - - template - GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) - { - if (numCols == 0) - throw std::logic_error("The slice cannot have 0 columns."); - - if (startColumn + numCols > m_numCols) - throw std::logic_error("The slice is out of range of the source matrix."); - - Clear(); - - m_computeDevice=fromMatrix.m_computeDevice; - m_externalBuffer=true; - m_numRows = fromMatrix.m_numRows; - m_pArray=fromMatrix.m_pArray + startColumn * m_numRows; - - m_elemSizeAllocated = GetNumElements(); - m_matrixName=NULL; - m_format = fromMatrix.m_format; - - return *this; - } - - - //for each column of a, we assign numRows starting from startIndex to this - template - GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) - { - if (a.IsEmpty()) - throw std::logic_error("AssignRowSliceValuesOf: input matrix a is empty."); - - if (startIndex + numRows > a.GetNumRows()) - throw std::logic_error("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); - - Resize(numRows, a.GetNumCols()); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)numRows, (long)a.GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //for the row slice of this starting from startIndex we add a to it. 
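In the column-major storage used throughout these matrices, element (i, j) lives at flat offset j*numRows + i, so the slice routines that follow only shift the row coordinate by startIndex within each column. A minimal host-side sketch of the accumulation AddToRowSliceValuesOf performs (the free function and its parameter names are illustrative, not from the sources):

    #include <cstddef>

    template <class ElemType>
    void AddToRowSlice(ElemType* dst, size_t dstRows,        // full destination matrix
                       const ElemType* src, size_t srcRows,  // slice-sized source, srcRows == numRows
                       size_t numCols, size_t startIndex)
    {
        for (size_t j = 0; j < numCols; ++j)
            for (size_t i = 0; i < srcRows; ++i)
                // column-major: (row, col) -> col * numRows + row
                dst[j * dstRows + (startIndex + i)] += src[j * srcRows + i];
    }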
- template - GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) - { - if (a.IsEmpty()) - throw std::logic_error("AddToRowSliceValuesOf: input matrix a is empty."); - - if (a.GetNumRows() != numRows) - throw std::logic_error("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); - - if (startIndex + numRows > GetNumRows()) - throw std::logic_error("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); - - if (a.GetNumCols() != GetNumCols()) - throw std::logic_error("AddToRowSliceValuesOf: columns does not match."); - - LONG64 N=(LONG64)a.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addToRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //for each column of this, we add row slice of a starting from startIndex - template - GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) - { - if (a.IsEmpty()) - throw std::logic_error("AddWithRowSliceValuesOf: input matrix a is empty."); - - if (GetNumRows() != numRows) - throw std::logic_error("AddWithRowSliceValuesOf: GetNumRows() != numRows."); - - if (startIndex + numRows > a.GetNumRows()) - throw std::logic_error("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); - - if (a.GetNumCols() != GetNumCols()) - throw std::logic_error("AddWithRowSliceValuesOf: columns does not match."); - - LONG64 N = (LONG64)GetNumElements(); - int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addWithRowSliceValuesOf << > >(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) - { - if (this == &a) - throw std::logic_error("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); - - if (a.IsEmpty()) - throw std::logic_error("AssignRepeatOf: Matrix a is empty."); - - Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); - - LONG64 N = (LONG64)GetNumElements(); - long n = (long)a.GetNumCols(), m = (long)a.GetNumRows(); - int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignRepeatOf << > >(m_pArray, a.m_pArray, N, m, n, (long)GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix GPUMatrix::Transpose() const - { - if (IsEmpty()) - throw std::logic_error("Transpose: Matrix is empty."); - - GPUMatrix c(GetComputeDeviceId()); - c.AssignTransposeOf(*this); - return c; - } - - // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU - // computeDevice - The compute device for which the cublas handle is desired - // returns: cublas handle - // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends - template - cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice/*=-1*/) - { - // if the compute device is not passed, get the current device from CUDA - if (computeDevice < 0) - cudaGetDevice(&computeDevice); - - if (computeDevice < 0 || computeDevice >= MaxGpus) - throw std::logic_error("GetCublasHandle: Maximum GPU exceeded"); - cublasHandle_t cuHandle = s_cuHandle[computeDevice]; - if (cuHandle == NULL) - { - s_cuHandle[computeDevice] = cuHandle = _initCUBLAS(computeDevice); - } - CUBLAS_CALL(cublasSetStream(cuHandle, t_stream)); - - return cuHandle; - } - - template - GPUMatrix& GPUMatrix::AssignTransposeOf (const GPUMatrix& a) - { - if (this == &a) - throw std::logic_error("AssignTransposeOf: a is the same as [this]. 
Does not support inplace transpose."); - - if (a.IsEmpty()) - throw std::logic_error("AssignTransposeOf: Matrix a is empty."); - - if (GetNumRows()!=a.GetNumCols() || GetNumCols()!=a.GetNumRows()) - Resize(a.GetNumCols(), a.GetNumRows()); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - cublasOperation_t transA = CUBLAS_OP_T; - cublasOperation_t transB = CUBLAS_OP_T; - int m = (int)a.m_numCols; - int n = (int)a.m_numRows; - ElemType alpha=1; - ElemType beta=0; - cublasStatus_t st; - if (sizeof(ElemType)==sizeof(float)) - { - st = cublasSgeam(cuHandle,transA,transB,m,n,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(&beta),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(m_pArray),(int)m_numRows); - } - else if (sizeof(ElemType)==sizeof(double)) - { - st = cublasDgeam(cuHandle,transA,transB,m,n,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(&beta),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(m_pArray),(int)m_numRows); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - if (st!=CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("AssignTransposeOf failed"); - } - m_numRows=a.m_numCols; - m_numCols=a.m_numRows; - SetMatrixName(a.GetMatrixName()); - return *this; - } - - template - void GPUMatrix::SetValue(const ElemType v) - { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setValue<<>>(m_pArray,v,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetValue(const ElemType* d_v) //d_v is pointer to the the value in GPU memory - { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setValue<<>>(m_pArray,d_v,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetColumn(const ElemType* colPointer, size_t colInd) - { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); - if (colPointer==NULL) - return; - CUDA_CALL(cudaMemcpy(m_pArray+LocateColumn(colInd),colPointer,sizeof(ElemType)*m_numRows,cudaMemcpyHostToDevice)); - } - - template - void GPUMatrix::SetValue(const GPUMatrix& deepCopyFrom) - { - if (this == &deepCopyFrom) - return; - - Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols()); - m_format = deepCopyFrom.m_format; // copy the format over just to be sure - size_t cpSize = deepCopyFrom.GetNumRows() * deepCopyFrom.GetNumCols(); - if (cpSize != 0) - CUDA_CALL(cudaMemcpy(m_pArray,deepCopyFrom.m_pArray,cpSize*sizeof(ElemType),cudaMemcpyDeviceToDevice)); - } - - template - void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, size_t matrixFlags, int deviceId) - { - // handle externally managed case - if (matrixFlags&matrixFlagDontOwnBuffer) - { - // free the existing array if it used to be an owned array - if (OwnBuffer() && m_pArray!=NULL) - { - 
PrepareDevice(); - CUDA_CALL(cudaFree(m_pArray)); - } - m_numRows = numRows; - m_numCols = numCols; - m_pArray = pArray; - m_elemSizeAllocated = GetNumElements(); - m_matrixName = NULL; - m_format = matrixFormatDense; - m_externalBuffer = true; - m_computeDevice = deviceId; - } - else - { - // if didn't previously own the buffer, wipe it clean - if (!OwnBuffer()) - { - ZeroInit(deviceId); - } - - // if the devices are different move it now - if (m_computeDevice != deviceId && deviceId >= 0) - { - Clear(); - ZeroInit(deviceId); - } - - // now resize/allocate as necessary - Resize(numRows, numCols); - m_externalBuffer = false; - - // copy over the content to the buffer - PrepareDevice(); - if (pArray!=NULL) - { - if (!(matrixFlags&matrixFormatRowMajor)) - { - CUDA_CALL(cudaMemcpy(m_pArray, pArray, sizeof(ElemType)*GetNumElements(), - (matrixFlags&matrixFlagSetValueOnDevice)?cudaMemcpyDeviceToDevice:cudaMemcpyHostToDevice)); - } - else - { - throw std::runtime_error("Row major isn't implemented"); - } - } - } - m_format = matrixFormatDense; - } - - - template - void GPUMatrix::SetDiagonalValue(const ElemType v) - { - unsigned long N=(unsigned long)GetNumRows(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setDiagonalValue<<>>(m_pArray,v,N,(unsigned long)GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetDiagonalValue(GPUMatrix& vector) - { - if (IsEmpty() || vector.IsEmpty()) - throw std::logic_error("SetDiagonalValue: Matrix is empty."); - - if (GetNumRows() != GetNumCols()) - throw std::logic_error("SetDiagonalValue: NumRows and NumCols do not agree."); - - if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) - throw std::logic_error("SetDiagonalValue: input vector must be a vector."); - - if (vector.GetNumElements() == 1) //reduce to simple form - SetDiagonalValue(vector.m_pArray[0]); - - else if (vector.GetNumRows() != GetNumRows()) - throw std::logic_error("SetDiagonalValue: input vector's dimension does not agree with [this]."); - else - { - long N=(long)GetNumRows(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setDiagonalValueFromVector<<>>(m_pArray,vector.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - template - void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) - { - PrepareDevice(); - if (s_curandGenerator==NULL) - { - s_curandGenerator = new curandGenerator_t; - /* Create pseudo-random number generator */ - CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); - CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); - } - - cudaEvent_t done = nullptr; - CUDA_CALL(cudaEventCreate(&done)); - if (sizeof(ElemType)==sizeof(float)) - { - CURAND_CALL(curandGenerateUniform(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); - } - else - { - CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); - } - CUDA_CALL(cudaEventRecord(done)); - CUDA_CALL(cudaEventSynchronize(done)); - //CURAND_CALL(curandDestroyGenerator(gen)); - CUDA_CALL(cudaEventDestroy(done)); - - size_t N=GetNumElements(); - size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rescaleToRange<<>>(m_pArray,N,low,high); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) - { - PrepareDevice(); - if (s_curandGenerator==NULL) - { - s_curandGenerator = new curandGenerator_t; - /* Create pseudo-random number generator */ - CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? time(NULL) : seed)); - CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); - } - - if (sizeof(ElemType)==sizeof(float)) - { - CURAND_CALL(curandGenerateNormal(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (float)mean, (float)sigma)); - } - else - { - CURAND_CALL(curandGenerateNormalDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (double)mean, (double)sigma)); - } - //CURAND_CALL(curandDestroyGenerator(gen)); - } - - //maskRate: percentage of values masked out (similar to dropout rate) - //scaleValue: which scale value to set to the left ones (unmasked items). - template - void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) - { - PrepareDevice(); - if (s_curandGenerator==NULL) - { - s_curandGenerator = new curandGenerator_t; - /* Create pseudo-random number generator */ - CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); - CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); - } - - cudaEvent_t done = nullptr; - CUDA_CALL(cudaEventCreate(&done)); - if (sizeof(ElemType)==sizeof(float)) - { - CURAND_CALL(curandGenerateUniform((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); - } - else - { - CURAND_CALL(curandGenerateUniformDouble((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); - } - CUDA_CALL(cudaEventRecord(done)); - CUDA_CALL(cudaEventSynchronize(done)); - CUDA_CALL(cudaEventDestroy(done)); - //CURAND_CALL(curandDestroyGenerator(gen)); - - size_t N=GetNumElements(); - size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setMaskAndScale<<>>(m_pArray,N,maskRate,scaleValue); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::Adagrad(GPUMatrix& gradients) - { - if (IsEmpty()) - { - Resize(gradients.GetNumRows(), gradients.GetNumCols()); - SetValue(0.0); - } - - assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols()); - - int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; - _adagrad<<>>(m_pArray, gradients.m_pArray, GetNumElements()); - } - - template - void GPUMatrix::RmsProp(GPUMatrix& gradients, - ElemType RMS_GAMMA, - ElemType RMS_WGT_INC, - ElemType RMS_WGT_MAX, - ElemType RMS_WGT_DEC, - ElemType RMS_WGT_MIN - ) - { - const ElemType floor = 1e-6f; - static ElemType *upd_gpu = (ElemType*)0; - - size_t n = gradients.GetNumElements(); - int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; - - if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3) - { - Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3); - SetValue(0.0); - - ElemType *avars=m_pArray; // accumulated variances for RMS scaling - ElemType *signs=m_pArray+n; // sign of previous gradient - ElemType *steps=m_pArray+2*n; // current step size - - _rmsprop_init<<>>(avars,signs,steps,gradients.m_pArray,n); - - } - - ElemType *avars=m_pArray; // accumulated variances for RMS scaling - ElemType *signs=m_pArray+n; // sign of previous gradient - ElemType *steps=m_pArray+2*n; // current step size - - assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3); - - if( !upd_gpu ) - { - ElemType upd[] = { - 2,2,0, - 2,2,0, - 1,1,1, - 2,2,0, - 1,2,1, - 0,2,2, - 1,1,1, - 0,2,2, - 0,2,2, - }; - - CUDA_CALL(cudaMalloc((void**)&upd_gpu,sizeof(ElemType)*27)); - CUDA_CALL(cudaMemcpy(upd_gpu,upd,sizeof(ElemType)*27,cudaMemcpyHostToDevice)); - } - - _rmsprop<<>>(avars,signs,steps,gradients.m_pArray,n, - RMS_GAMMA,RMS_WGT_INC,RMS_WGT_MAX,RMS_WGT_DEC,RMS_WGT_MIN, - floor,upd_gpu); - } - - template - void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) - { - assert (numRows*numCols == GetNumElements()); - if (numRows*numCols != GetNumElements()) - throw std::invalid_argument("Reshape: total number of elements does not match."); - - m_numRows = numRows; - m_numCols = numCols; - } - - template - void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) - { - if (m_numRows==numRows && m_numCols==numCols) - return; - - m_numRows = numRows; - m_numCols = numCols; - - size_t numElements = GetNumElements(); - if (numElements > 
m_elemSizeAllocated || (!growOnly && numElements != m_elemSizeAllocated)) - { - if (IsEmpty()) - { - m_elemSizeAllocated = 0; - m_pArray = NULL; - } - else - { - if (!OwnBuffer()) - throw std::invalid_argument("Can't resize a externally managed matrix"); - PrepareDevice(); - if (m_pArray!=NULL) - CUDA_CALL(cudaFree(m_pArray)); //delete and reallocate - m_elemSizeAllocated = numElements; - CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); - CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); - } - } - } - - template - size_t GPUMatrix::LocateElement (const size_t row, const size_t col) const - { - assert (row < m_numRows && col < m_numCols); - return col * m_numRows + row; // matrix in column-wise storage - } - - template - size_t GPUMatrix::LocateColumn (const size_t col) const - { - assert (col < m_numCols); - return col * m_numRows; // matrix in column-wise storage - } - - template - ElemType GPUMatrix::Get00Element() const - { - ElemType res=0; - CUDA_CALL(cudaMemcpy(&res,m_pArray,sizeof(ElemType),cudaMemcpyDeviceToHost)); - return res; - } -#pragma endregion Basic Operators - -#pragma region Member BLAS Functions - template - GPUMatrix& GPUMatrix::operator+= (ElemType alpha) - { - if (IsEmpty()) - throw std::logic_error("operator+=: Matrix is empty."); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addValue<<>>(m_pArray,alpha,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix GPUMatrix::operator+ (ElemType alpha) const - { - if (IsEmpty()) - throw std::logic_error("operator+: Matrix is empty."); - - const GPUMatrix& us=*this; - GPUMatrix c(us); - c+=alpha; - return c; - } - - template - GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) - { - SetValue(a); - (*this)+=alpha; - return (*this); - } - - - template - GPUMatrix& GPUMatrix::operator+= (const GPUMatrix& a) - { - //if (a.GetNumElements()==1) - //{ - // //*this += a.Get00Element(); - // LONG64 N=(LONG64)GetNumElements(); - // int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - // cudaEvent_t done = nullptr; - // if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - // _addValue<<>>(m_pArray,a.m_pArray,N); - // if (do_sync) CUDA_CALL(cudaEventRecord(done)); - // if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - // if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - //} - //else - //{ - ScaleAndAdd(1, a, *this); - //} - return *this; - } - - template - GPUMatrix GPUMatrix::operator+ (const GPUMatrix& a) const - { - if (GetNumElements()==1) - { - GPUMatrix c(a); - c+=Get00Element(); - return c; - } - else if (a.GetNumElements()==1) - { - GPUMatrix c(*this); - c+=a.Get00Element(); - return c; - } - else - { - GPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but make resue of the code - c += a; - return c; - } - } - - template - GPUMatrix& GPUMatrix::AssignSumOf(const GPUMatrix& a, const GPUMatrix& b) - { - SetValue(a); - (*this)+=b; - return (*this); - } - - template - GPUMatrix& GPUMatrix::operator-= (ElemType alpha) - { - if (IsEmpty()) - throw std::logic_error("operato-=: Matrix is empty."); - return operator+=(-1*alpha); - } - - template - GPUMatrix GPUMatrix::operator- (ElemType alpha) const - { - if (IsEmpty()) - throw std::logic_error("operator-: Matrix is empty."); - return operator+(-1*alpha); - } - - template - GPUMatrix& GPUMatrix::AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a) - { - Resize(a.m_numRows,a.m_numCols); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignDifferenceOf1<<>>(m_pArray,alpha,a.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - /*Resize(a.m_numRows,a.m_numCols); - SetValue(alpha); - (*this)-=a; - return *this;*/ - } - - template - GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha) - { - Resize(a.m_numRows,a.m_numCols); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignDifferenceOf2<<>>(m_pArray,alpha,a.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - /*SetValue(a); - (*this)-=alpha; - return *this;*/ - } - - template - GPUMatrix& GPUMatrix::operator-= (const GPUMatrix& a) - { - //if (a.GetNumElements() == 1) - // AssignDifferenceOf(*this, a.Get00Element()); - //else if (GetNumElements() == 1) - // AssignDifferenceOf(Get00Element(), a); - //else - ScaleAndAdd(-1, a, *this); - - return *this; - } - - template - GPUMatrix GPUMatrix::operator- (const GPUMatrix& a) const - { - GPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but make resue of the code - c -= a; - return c; - } - - template - GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b) - { - if (this != &a) - { - Resize(a.GetNumRows(), a.GetNumCols()); - SetValue(a); - } - (*this) -= b; - return *this; - } - - template - GPUMatrix& GPUMatrix::operator*= (ElemType alpha) - { - Scale(alpha, *this); - return *this; - } - - template - GPUMatrix GPUMatrix::operator* (ElemType alpha) const - { - GPUMatrix c(GetNumRows(), GetNumCols()); - Scale(alpha, *this, c); - return c; - } - - template - GPUMatrix& GPUMatrix::AssignProductOf(const ElemType alpha, const GPUMatrix& a) - { - Scale(alpha, a, *this); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignProductOf (const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB) - { - if (a.GetNumElements() == 1) - { - if (transposeB) - AssignTransposeOf(b); - (*this) *= a.Get00Element(); - } - else if (b.GetNumElements() == 1) - { - if (transposeA) - AssignTransposeOf(a); - (*this) *= b.Get00Element(); - } - else - Multiply(a, transposeA, b, transposeB, *this); - return *this; - } - - template - GPUMatrix GPUMatrix::operator* (const GPUMatrix& a) const - { - const GPUMatrix& us = *this; - if (GetNumElements() == 1) - { - GPUMatrix c(GetComputeDeviceId()); - c.AssignProductOf(Get00Element(), a); - return c; - } - else if (a.GetNumElements() == 1) - { - GPUMatrix c(GetComputeDeviceId()); - c.AssignProductOf(a.Get00Element(), us); - return c; - } - else - { - GPUMatrix c(GetNumRows(),a.GetNumCols(),GetComputeDeviceId()); - Multiply(*this, a, c); - return c; - } - } - - template - GPUMatrix& GPUMatrix::operator/= (ElemType alpha) - { - (*this) *= 1/alpha; - return (*this); - } - - template - GPUMatrix GPUMatrix::operator/ (ElemType alpha) const - { - return ((*this) * (1/alpha)); - } - - //element-wise power - template - GPUMatrix& GPUMatrix::operator^= (ElemType alpha) - { - GPUMatrix& us = *this; - ElementWisePower(alpha, us, us); - return us; - } - - template - GPUMatrix GPUMatrix::operator^ (ElemType alpha) const - { - GPUMatrix c(GetNumRows(), GetNumCols()); - ElementWisePower(alpha, *this, c); - return c; - } - - template - GPUMatrix& GPUMatrix::AssignElementPowerOf(const GPUMatrix& a, const ElemType power) - { - ElementWisePower(power, a, *this); - return *this; - } - - - template - GPUMatrix& GPUMatrix::AddElementProductOf (const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AddElementProductOf: Matrix is empty."); - - assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - throw std::invalid_argument("The input matrix dimensions do not match."); - - if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols())) - throw std::invalid_argument("The input matrix dimensions do not match [this]."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addElementProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::ColumnElementMultiplyWith(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("ColumnElementMultiplyWith: 
Matrix is empty."); - - if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) - throw std::invalid_argument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); - - long N=(long)a.GetNumRows(); - long M=(long)GetNumCols(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _columnElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("RowElementMultiplyWith: Matrix is empty."); - - if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) - throw std::invalid_argument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); - - long N = (long)GetNumRows(); - long M = (long)a.GetNumCols(); - int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rowElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::RowElementDivideBy(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("RowElementDivideBy: Matrix is empty."); - - if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) - throw std::invalid_argument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); - - long N = (long)GetNumRows(); - long M = (long)a.GetNumCols(); - int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rowElementDivideBy << > >(m_pArray, a.m_pArray, N, M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::ColumnElementDivideBy(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("ColumnElementDivideBy: Matrix is empty."); - - if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) - throw std::invalid_argument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows."); - - long N = (long)a.GetNumRows(); - long M = (long)GetNumCols(); - int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _ColumnElementDivideBy<<>>(m_pArray,a.m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::ElementInverse () - { - if (IsEmpty()) - throw std::logic_error("ElementInverse: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _elemInverse<<>>(m_pArray,N); - if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignElementInverseOf (const GPUMatrix& a) - { - SetValue(a); - return ElementInverse(); - } - - template - GPUMatrix& GPUMatrix::InplaceSigmoid() - { - performInplaceFunction(0); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignSigmoidOf (const GPUMatrix& a) - { - Resize(a.GetNumRows(),a.GetNumCols()); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignSigmoidOf<<>>(a.m_pArray,m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - /*SetValue(a); - InplaceSigmoid();*/ - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceSigmoidDerivative() - { - AssignSigmoidDerivativeOf(*this); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignSigmoidDerivativeOf (const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignSigmoidDerivativeOf: Matrix a is empty."); - - //auto& us=*this; - if (this != &a) - Resize(a.GetNumRows(), a.GetNumCols()); - - PrepareDevice(); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - - _assignSigmoidDerivative<<>>(a.m_pArray, m_pArray, N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - - template - GPUMatrix& GPUMatrix::InplaceTanh() - { - performInplaceFunction(1); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignTanhOf (const GPUMatrix& a) - { - SetValue(a); - InplaceTanh(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceLogSoftmax (const bool isColWise) - { - if (IsEmpty()) - throw std::logic_error("InplaceLogSoftmax: Matrix is empty."); - - PrepareDevice(); - if (isColWise) - { - long N=(long)GetNumCols(); //one kernel per column - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _logSoftMaxColWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else - { - long N=(long)GetNumRows(); //one kernel per column - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _logSoftMaxRowWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignLogSoftmaxOf (const GPUMatrix& a, const bool isColWise) - { - Resize(a.GetNumRows(),a.GetNumCols()); - if (isColWise) - { - PrepareDevice(); - long N = (long)GetNumCols(); - long M = (long)GetNumRows(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignColumnwiseLogSoftmaxOf<<>>(a.m_pArray,m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) 
CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else - { - NOT_IMPLEMENTED; - } - - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceSqrt() - { - performInplaceFunction(2); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignSqrtOf (const GPUMatrix& a) - { - SetValue(a); - InplaceSqrt(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceExp() - { - performInplaceFunction(3); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignExpOf (const GPUMatrix& a) - { - SetValue(a); - InplaceExp(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceLog() - { - performInplaceFunction(4); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignLogOf (const GPUMatrix& a) - { - SetValue(a); - InplaceLog(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceAbs() - { - performInplaceFunction(5); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignAbsOf (const GPUMatrix& a) - { - SetValue(a); - InplaceAbs(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceLinearRectifierDerivative() - { - performInplaceFunction(6); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignLinearRectifierDerivativeOf (const GPUMatrix& a) - { - SetValue(a); - InplaceLinearRectifierDerivative(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceCosine() - { - performInplaceFunction(7); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignCosineOf (const GPUMatrix& a) - { - SetValue(a); - InplaceCosine(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceNegativeSine() - { - performInplaceFunction(8); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignNegativeSineOf (const GPUMatrix& a) - { - SetValue(a); - InplaceNegativeSine(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceTruncateBottom (const ElemType threshold) - { - if (IsEmpty()) - throw std::logic_error("InplaceTruncateBottom: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _inplaceTruncateBottom<<>>(m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignTruncateBottomOf (const GPUMatrix& a, const ElemType threshold) - { - if (a.IsEmpty()) - throw std::logic_error("AssignTruncateBottomOf: Matrix a is empty."); - - if (this!=&a) - { - Resize(a.GetNumRows(), a.GetNumCols()); - } - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignTruncateBottom<<>>(m_pArray,a.m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceTruncateTop (const ElemType threshold) - { - if (IsEmpty()) - throw std::logic_error("InplaceTruncateTop: Matrix is empty."); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _inplaceTruncateTop<<>>(m_pArray,threshold,N); 
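The create/record/synchronize/destroy sequence just after this launch recurs around nearly every kernel in the file; when do_sync is set it makes each launch effectively blocking, so a failure surfaces at the offending kernel rather than at some later API call. A hedged RAII sketch of the same idiom (ScopedLaunchSync is a hypothetical helper, not in the sources; CUDA_CALL-style error checking omitted for brevity):

    #include <cuda_runtime.h>

    class ScopedLaunchSync
    {
        cudaEvent_t m_done = nullptr;
        bool m_active;
    public:
        explicit ScopedLaunchSync(bool doSync) : m_active(doSync)
        {
            if (m_active) cudaEventCreate(&m_done);   // mirrors the cudaEventCreate above
        }
        ~ScopedLaunchSync()                           // record, wait, clean up on scope exit
        {
            if (m_active)
            {
                cudaEventRecord(m_done);
                cudaEventSynchronize(m_done);
                cudaEventDestroy(m_done);
            }
        }
    };

A launch wrapped as { ScopedLaunchSync sync(do_sync); kernel<<<grid, block>>>(...); } would then record and wait on scope exit, matching the hand-written pattern.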
- if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignTruncateTopOf (const GPUMatrix& a, const ElemType threshold) - { - if (a.IsEmpty()) - throw std::logic_error("AssignTruncateTopOf: Matrix a is empty."); - - if (this!=&a) - { - Resize(a.GetNumRows(), a.GetNumCols()); - } - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignTruncateTop<<>>(m_pArray,a.m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - template - GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) - { - if (IsEmpty()) - throw std::logic_error("SetToZeroIfAbsLessThan: Matrix is empty."); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setToZeroIfAbsLessThan<<>>(m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - ElemType GPUMatrix::SumOfAbsElements() const - { - if (IsEmpty()) - throw std::logic_error("SumOfAbsElements: Matrix is empty"); - - cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); - if (sizeof(ElemType)==sizeof(float)) - { - float res=0; - cublasSasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); - return res; - } - else - { - double res=0; - cublasDasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); - return ElemType(res); - } - } - - template - ElemType GPUMatrix::SumOfElements() const - { - if (IsEmpty()) - throw std::logic_error("SumOfElements: Matrix is empty"); - - PrepareDevice(); - ElemType* d_sum = NULL; - ElemType h_sum; - CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements()); - CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_sum)); - return h_sum; - } - - - template - GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignSumOfElements: Matrix a is empty"); - - Resize(1,1); - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - //WARNING: THIS kernel is not the most efficient way! - _reductionSumAndAssign<<<1,1024>>>(m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return (*this); - } - - template - DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const - { - if (IsEmpty()) - throw std::logic_error("Matrix is empty"); - PrepareDevice(); - ElemType* d_sum = NULL; - CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! 
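The warning is accurate: the <<<1,1024>>> launch that follows gives the reduction a single block, so at most 1024 threads cover the whole array with no grid-level parallelism. The usual remedy is a multi-block tree reduction; a sketch, not the library's kernel (assumes blockDim.x is a power of two and at most 1024, and that the host or a second pass sums the per-block partials):

    __global__ void BlockSum(const float* in, float* partial, long long n)
    {
        __shared__ float s[1024];
        long long i = blockIdx.x * (long long)blockDim.x + threadIdx.x;
        s[threadIdx.x] = (i < n) ? in[i] : 0.0f;     // each thread loads one element (or 0 past the end)
        __syncthreads();
        for (unsigned stride = blockDim.x / 2; stride > 0; stride >>= 1)
        {
            if (threadIdx.x < stride)
                s[threadIdx.x] += s[threadIdx.x + stride]; // pairwise tree reduction in shared memory
            __syncthreads();
        }
        if (threadIdx.x == 0)
            partial[blockIdx.x] = s[0];              // one partial sum per block
    }
    // launched roughly as: BlockSum<<<(n + 1023) / 1024, 1024>>>(in, partial, n);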
-        _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements());
-        DeviceBoundNumber<ElemType> result;
-        result.ShallowCopyFrom(d_sum,GetComputeDeviceId());
-        return result;
-    }
-
-    template<class ElemType>
-    ElemType GPUMatrix<ElemType>::Max() const
-    {
-        cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId());
-        ElemType res;
-        if (sizeof(ElemType)==sizeof(float))
-        {
-            int resInd=0;
-            cublasIsamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast<float*>(m_pArray),1,&resInd);
-            resInd--;
-            CUDA_CALL(cudaMemcpy(reinterpret_cast<float*>(&res),reinterpret_cast<float*>(m_pArray+resInd),sizeof(float),cudaMemcpyDeviceToHost));
-            return res;
-        }
-        else
-        {
-            int resInd=0;
-            cublasIdamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast<double*>(m_pArray),1,&resInd);
-            resInd--;
-            CUDA_CALL(cudaMemcpy(reinterpret_cast<double*>(&res),m_pArray+resInd,sizeof(double),cudaMemcpyDeviceToHost));
-            return res;
-        }
-    }
-
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::ElementMultiplyWith (const GPUMatrix<ElemType>& a)
-    {
-        if (IsEmpty() || a.IsEmpty())
-            throw std::logic_error("ElementMultiplyWith: Matrix is empty.");
-
-        GPUMatrix<ElemType>& us=*this;
-        assert (us.GetNumRows() == a.GetNumRows() && us.GetNumCols() == a.GetNumCols());
-        if (us.GetNumRows() != a.GetNumRows() || us.GetNumCols() != a.GetNumCols())
-            throw std::invalid_argument("The matrix dimensions do not match.");
-
-        LONG64 N=(LONG64)GetNumElements();
-        int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock);
-        a.PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _elemMul<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
-    }
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignElementProductOf (const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
-    {
-        if (a.IsEmpty() || b.IsEmpty())
-            throw std::logic_error("AssignElementProductOf: Matrix is empty.");
-
-        assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
-        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
-            throw std::invalid_argument("The input matrix dimensions do not match.");
-
-        Resize(a.GetNumRows(), a.GetNumCols());
-        LONG64 N=(LONG64)GetNumElements();
-        int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock);
-        a.PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
-    }
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::ElementDivideBy(const GPUMatrix<ElemType>& a)
-    {
-        return AssignElementDivisionOf(*this, a);
-    }
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignElementDivisionOf (const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
-    {
-        if (a.IsEmpty() || b.IsEmpty())
-            throw std::logic_error("AssignElementDivisionOf: Matrix is empty.");
-
-        assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
-        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
-            throw std::invalid_argument("The input matrix dimensions do not match.");
-
-        Resize(a.GetNumRows(), a.GetNumCols());
-        LONG64 N=(LONG64)GetNumElements();
-        int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock);
-        a.PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementDivisionOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray,N);
-        if (do_sync)
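// [Editor's note; not part of the original diff] Two details in Max() above are easy to
// trip over: cublasI<t>amax returns a 1-based index (Fortran convention), hence the
// resInd-- before dereferencing, and amax selects by the largest *absolute* value, so
// the returned element can be negative. In isolation (float path, this file's CUDA_CALL):
//
//     int idx1 = 0;
//     cublasIsamax(cuHandle, n, d_x, 1, &idx1);   // idx1 is in [1..n]
//     float v = 0;
//     CUDA_CALL(cudaMemcpy(&v, d_x + (idx1 - 1), sizeof(float), cudaMemcpyDeviceToHost));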
CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const - { - return AreEqual(*this, a, threshold); - } - - template - void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorNorm1: Matrix is empty."); - - const long n = (long)GetNumRows(); - const long m = (long)GetNumCols(); - assert (m>0 && n>0); //converting from size_t to int may cause overflow - - cudaEvent_t done = nullptr; - PrepareDevice(); - c.ChangeDeviceTo(GetComputeDeviceId()); - - int blocksPerGrid=0; - if (isColWise) //col-wise - { - c.Resize(1,m); - blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); - } - else - { - c.Resize(n, 1); - c.ChangeDeviceTo(GetComputeDeviceId()); - blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); - } - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _vectorNorm1<<>>(c.m_pArray, m_pArray,n,m,isColWise); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) - { - a.VectorNorm1(*this, isColWise); - return *this; - } - - template - void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorNorm2: Matrix is empty."); - - const long n = (long)GetNumRows(); - const long m = (long)GetNumCols(); - assert (m>0 && n>0); //converting from size_t to int may cause overflow - - cudaEvent_t done = nullptr; - PrepareDevice(); - c.ChangeDeviceTo(GetComputeDeviceId()); - - int blocksPerGrid=0; - if (isColWise) //col-wise - { - c.Resize(1,m); - blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); - } - else - { - c.Resize(n, 1); - c.ChangeDeviceTo(GetComputeDeviceId()); - blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); - } - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _vectorNorm2<<>>(c.m_pArray, m_pArray,n,m,isColWise); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) - { - a.VectorNorm2(*this, isColWise); - return *this; - } - - template - void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorMax: Matrix is empty."); - - //this implementation is not efficient - GPUMatrix tmp; - GPUMatrix tmp1; - tmp.AssignAbsOf((*this)); - tmp.VectorMax(tmp1,c,isColWise); - } - - template - GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) - { - a.VectorNormInf(*this, isColWise); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) - { - InnerProduct (a, b, *this,isColWise); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AssignKhatriRaoProductOf: Matrix is empty."); - - long cols = a.GetNumCols(); - assert (cols == b.GetNumCols()); - if (!(cols == b.GetNumCols())) - throw std::invalid_argument("AssignKhatriRaoProductOf: The input matrix dimensions do not match."); - - long rowsA = 
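// [Editor's sketch; not part of the original diff] For reference, the column-wise
// Khatri-Rao product built here pairs up columns: output column j is the Kronecker
// product of a(:,j) with b(:,j), so the result is (rowsA*rowsB) x cols. A CPU sketch
// with an assumed a-major row ordering (the GPU kernel defines the authoritative
// layout); data is column-major as everywhere in this file:
//
//     void khatriRaoRef(const float* a, long rowsA, const float* b, long rowsB,
//                       long cols, float* out /* (rowsA*rowsB) x cols */)
//     {
//         for (long j = 0; j < cols; ++j)
//             for (long ia = 0; ia < rowsA; ++ia)
//                 for (long ib = 0; ib < rowsB; ++ib)
//                     out[(ia*rowsB + ib) + j*(rowsA*rowsB)] =
//                         a[ia + j*rowsA] * b[ib + j*rowsB];
//     }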
(long)a.GetNumRows(); - long rowsB = (long)b.GetNumRows(); - Resize(rowsA * rowsB, cols); - float N=(float)GetNumElements(); - int blocksPerGrid =(int)ceil(N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignKhatriRaoProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //column-wise reshaped product. Used to compute KhatriRaoProduct Gradient - // this = reshape each column of a from (K1xK2,1) to (K1, K2) - // if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames). - // the output is a (K1, frames) matrix - // if each column of a is tranposed, each (K1, K2)^T times each column of b(K1, frames) and output is (K2, frames) - template - GPUMatrix& GPUMatrix::AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AddColumnReshapeProductOf: Matrix is empty."); - - long cols = a.GetNumCols(); - assert (cols == b.GetNumCols()); - if (!(cols == b.GetNumCols())) - throw std::invalid_argument("AddColumnReshapeProductOf: The input matrix dimensions do not match."); - - long rowsA = (long)a.GetNumRows(); - long rowsB = (long)b.GetNumRows(); - if (rowsA % rowsB != 0) - throw std::invalid_argument("AddColumnReshapeProductOf: number of rows in a should be multiples of that in b."); - - long rowsC = rowsA / rowsB; - if (rowsC != GetNumRows() || cols != GetNumCols()) - throw std::invalid_argument("AddColumnReshapeProductOf: This matrix does not have the right size."); - - float N=(float)GetNumElements(); - int blocksPerGrid =(int)ceil(N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addColumnReshapeProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AddWithScaleOf(ElemType alpha, const GPUMatrix& a) - { - ScaleAndAdd(alpha, a, *this); - return *this; - } - - template - ElemType GPUMatrix::FrobeniusNorm() const - { - if (IsEmpty()) - throw std::logic_error("FrobeniusNorm: Matrix is empty."); - - PrepareDevice(); - ElemType* d_sum = NULL; - ElemType h_sum=0; - CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionSum2<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements(), true); - CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_sum)); - - return (h_sum); - } - - template - GPUMatrix& GPUMatrix::AssignFrobeniusNormOf (const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignFrobeniusNormOf: Matrix a is empty."); - - Resize(1,1); - - PrepareDevice(); - //WARNING: THIS kernel is not the most efficient way! 
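// [Editor's note; not part of the original diff] Since the buffer is dense and
// contiguous, the Frobenius norm computed by these reductions equals what cuBLAS nrm2
// returns (the square root of the sum of squares), which would avoid the single-block
// kernel entirely. Float-path sketch:
//
//     float nrm = 0;
//     CUBLAS_CALL(cublasSnrm2(cuHandle, (int)GetNumElements(),
//                             reinterpret_cast<float*>(m_pArray), 1, &nrm));
//     // nrm now holds ||A||_F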
- _reductionSum2<<<1,1024,0,t_stream>>>(a.m_pArray,m_pArray,(LONG64)a.GetNumElements(), true); - - return *this; - } - - template - ElemType GPUMatrix::MatrixNormInf() const - { - if (IsEmpty()) - throw std::logic_error("MatrixNorm1: Matrix is empty."); - - PrepareDevice(); - ElemType* d_maxAbs = NULL; - ElemType h_maxAbs=0; - CUDA_CALL(cudaMalloc((void**)&d_maxAbs,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionMatrixNormInf<<<1,1024,0,t_stream>>>(m_pArray,d_maxAbs,(LONG64)GetNumElements()); - CUDA_CALL(cudaMemcpy(&h_maxAbs,d_maxAbs,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_maxAbs)); - return h_maxAbs; - } - - template - ElemType GPUMatrix::MatrixNorm1() const - { - if (IsEmpty()) - throw std::logic_error("MatrixNorm1: Matrix is empty."); - return SumOfAbsElements(); - } - - template - ElemType GPUMatrix::MatrixNorm0() const - { - if (IsEmpty()) - throw std::logic_error("MatrixNorm0: Matrix is empty."); - - PrepareDevice(); - ElemType* d_nz = NULL; - ElemType h_nz=0; - CUDA_CALL(cudaMalloc((void**)&d_nz,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionMatrixNorm0<<<1,1024,0,t_stream>>>(m_pArray,d_nz,(LONG64)GetNumElements()); - CUDA_CALL(cudaMemcpy(&h_nz,d_nz,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_nz)); - return h_nz; - } - - template - GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignSignOf: Matrix a is empty."); - - if (this != &a) - Resize(a.GetNumRows(), a.GetNumCols()); - - PrepareDevice(); - cudaEvent_t done = nullptr; - int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignSignOf<<>>(m_pArray, a.m_pArray, (long)GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AddSignOf: Matrix a is empty."); - - if (this != &a) - Resize(a.GetNumRows(), a.GetNumCols()); - - PrepareDevice(); - cudaEvent_t done = nullptr; - int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addSignOf<<>>(m_pArray, a.m_pArray, (LONG64)GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorMax: Matrix is empty."); - - const GPUMatrix& us=*this; - const long m = (long)GetNumRows(); - const long n = (long)GetNumCols(); - assert (m>0 && n>0); //converting from size_t to int may cause overflow - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - if (isColWise) - { - maxValues.Resize(1, n); - maxIndexes.Resize(1, n); - - int blocksPerGrid = n; //we'll have 1 block processing 1 column - _vectorMaxMinReduce<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,true); - - /*int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise);*/ - } - else - { - maxValues.Resize(m, 1); - maxIndexes.Resize(m, 1); - int 
blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); - _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise); - } - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorMax: Matrix is empty."); - - const GPUMatrix& us=*this; - const int m = (int)GetNumRows(); - const int n = (int)GetNumCols(); - - assert (m>0 && n>0); //converting from size_t to int may cause overflow - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - if (isColWise) - { - minValues.Resize(1, n); - minIndexes.Resize(1, n); - - int blocksPerGrid = n; //we'll have 1 block processing 1 column - _vectorMaxMinReduce<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,false); - - /* - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise);*/ - } - else - { - minValues.Resize(m, 1); - minIndexes.Resize(m, 1); - int blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); - _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise); - } - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) - throw std::invalid_argument ("AssignNumOfDiff: a and b must have same dimension."); - - Resize(1,1); //result should be one element - - PrepareDevice(); - cudaEvent_t done = nullptr; - //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - //_assignNumOfDiff<<>>(a.m_pArray, b.m_pArray, m_pArray, a.GetNumElements()); - _assignNumOfDiff<<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, (LONG64)a.GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - -#pragma endregion Member BLAS Functions - -#pragma region Other helper functions - template - void GPUMatrix::Print(const char* /*matrixName*/, size_t /*rowStart*/, size_t /*rowEnd*/, size_t /*colStart*/, size_t /*colEnd*/) const - { - NOT_IMPLEMENTED; - } - - template - void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const - { - Print(matrixName, 0, GetNumRows()-1, 0, GetNumCols()-1); - } - - // file I/O - //matrixName is used to verify that correct matrix is read. - template - void GPUMatrix::ReadFromFile(FILE*, const char * /*matrixName*/) - { - NOT_IMPLEMENTED; - } - - //matrixName is used to verify that correct matrix is read. 
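// [Editor's sketch; not part of the original diff] In VectorMax/VectorMin above, the
// column-wise path gives each column its own *block* (_vectorMaxMinReduce) so a
// column's m elements are reduced in parallel, while the row-wise path and the
// commented-out alternative use the naive one-thread-per-column strategy. That naive
// kernel spelled out (hypothetical; column-major data; indexes stored as ElemType to
// match the index matrices above):
//
//     template<class ElemType>
//     __global__ void _naiveColArgmax(const ElemType* us, ElemType* maxIndexes,
//                                     ElemType* maxValues, const long m, const long n)
//     {
//         long j = blockDim.x * blockIdx.x + threadIdx.x;  // one thread per column
//         if (j >= n) return;
//         long best = 0;
//         ElemType bestVal = us[j*m];                      // column j starts at offset j*m
//         for (long i = 1; i < m; ++i)
//             if (us[i + j*m] > bestVal) { bestVal = us[i + j*m]; best = i; }
//         maxIndexes[j] = (ElemType)best;
//         maxValues[j]  = bestVal;
//     }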
- template - void GPUMatrix::WriteToFile(FILE*, const char * /*matrixName*/) - { - NOT_IMPLEMENTED; - } - - //helpfer function used for convolution neural network - template - GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, - const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, - const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, - const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, - const bool zeroPadding) - { - assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); - - size_t packedInputRows = kernelWidth * kernelHeight * inputChannels; - size_t packedInputColsPerSample = outputWidth * outputHeight; - size_t smallBatchSize = inputSubBatch.GetNumCols(); - Resize(packedInputRows, packedInputColsPerSample * smallBatchSize); - if (zeroPadding) - SetValue((ElemType)0); - - PrepareDevice(); - int numThreadPerBlock = threadsPerBlock; -#if 1 - int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock; -#else - dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize); -#endif - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignPackedConvolutionInput<<>>(m_pArray, - inputSubBatch.m_pArray, - smallBatchSize, - inputWidth, inputHeight, inputChannels, - outputWidth, outputHeight, outputChannels, - kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //helpfer function used for convolution neural network - template - GPUMatrix& GPUMatrix::UnpackConvolutionInput(GPUMatrix& inputSubBatch, - const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, - const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, - const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, - const bool zeroPadding) const - { - assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); - - size_t smallBatchSize = inputSubBatch.GetNumCols(); - - PrepareDevice(); - int numThreadPerBlock = threadsPerBlock; -#if 1 - int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock; -#else - dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize); -#endif - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _unpackConvolutionInput<<>>(m_pArray, - inputSubBatch.m_pArray, - smallBatchSize, - inputWidth, inputHeight, inputChannels, - outputWidth, outputHeight, outputChannels, - kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return inputSubBatch; - } - - template - GPUMatrix& GPUMatrix::AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const 
size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - unsigned int batchSize = inputBatch.GetNumCols(); - Resize(outputSizePerSample, batchSize); - - int numThreadPerBlock = threadsPerBlock; - int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignMaxPoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, - const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - unsigned int batchSize = outputGradientBatch.GetNumCols(); - int numThreadPerBlock = threadsPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - - int blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - _addMaxPoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, inputBatch.m_pArray, outputBatch.m_pArray, batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - unsigned int batchSize = inputBatch.GetNumCols(); - Resize(outputSizePerSample, batchSize); - - int numThreadPerBlock = threadsPerBlock; - int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignAveragePoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) 
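// [Editor's sketch; not part of the original diff] A CPU reference for the max-pooling
// forward pass above, to make the window/stride indexing concrete. The within-column
// layout assumed here, (y*width + x)*channels + c with one sample per column, is an
// illustration only; the GPU kernel is the authority.
//
//     void maxPoolRef(const float* in, float* out, size_t channels,
//                     size_t inW, size_t outW, size_t outH,
//                     size_t winW, size_t winH, size_t strideX, size_t strideY)
//     {
//         for (size_t c = 0; c < channels; ++c)
//             for (size_t oy = 0; oy < outH; ++oy)
//                 for (size_t ox = 0; ox < outW; ++ox)
//                 {
//                     float best = in[(oy*strideY*inW + ox*strideX)*channels + c];
//                     for (size_t wy = 0; wy < winH; ++wy)          // scan the window
//                         for (size_t wx = 0; wx < winW; ++wx)
//                         {
//                             float v = in[((oy*strideY + wy)*inW + (ox*strideX + wx))*channels + c];
//                             if (v > best) best = v;
//                         }
//                     out[(oy*outW + ox)*channels + c] = best;      // per-sample column
//                 }
//     }
//
// AddMaxPoolingGradient then routes each output gradient back to the location that won
// the max inside its window, which is why it needs both inputBatch and outputBatch.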
CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, - const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - size_t batchSize = outputGradientBatch.GetNumCols(); - int numThreadPerBlock = threadsPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - - size_t blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - _addAveragePoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, (long)batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - -#pragma endregion Other helper functions - -#pragma region Static BLAS Functions - template - void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, - ElemType beta, GPUMatrix& c) - { - a.PrepareDevice(); - if ((a.GetComputeDeviceId()!=b.GetComputeDeviceId()) || (b.GetComputeDeviceId()!=c.GetComputeDeviceId())) //different GPUs - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - cublasHandle_t cuHandle = GetCublasHandle(b.GetComputeDeviceId()); - cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; - int m = int(transposeA ? a.m_numCols : a.m_numRows); - int n = int(transposeB ? b.m_numRows : b.m_numCols); - int k = int(transposeA ? a.m_numRows : a.m_numCols); - int l = int(transposeB ? 
b.m_numCols : b.m_numRows); - c.Resize(m,n); - - if (!(m>0 && k>0 && l>0 && n>0)) - { - throw std::runtime_error("!(m>0 && k>0 && l>0 && n>0)"); //converting from size_t to int may cause overflow - } - if (k!=l) - { - throw std::runtime_error("matrix dim mismatch in MultiplyAndWeightedAdd"); - } - if (sizeof(ElemType)==sizeof(float)) - { - CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); - } - else if (sizeof(ElemType)==sizeof(double)) - { - CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - c.m_numRows=m; - c.m_numCols=n; - } - } - - template - void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) - { - return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 1, c); - } - - template - void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) - { - return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 0, c); - } - - template - void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); - } - - /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c - /// if a is a column vector, add to all columns of c - /// if a is a row vector, add to all rows of c - /// if a is a scalar, add to all elements of c - /// Scalar - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, GPUMatrix& c) - { - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - if (a.IsEmpty() || c.IsEmpty()) - throw std::logic_error("ScaleAndAdd: one of the input matrices is empty."); - //if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector - if (a.GetNumRows()==c.GetNumRows() && a.GetNumCols()==c.GetNumCols()) // dimensions match - { - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int len = m * n; - const int incx = 1; - const int incy = 1; - - assert (m>0 && n>0 && len>0); //converting from size_t to int may cause overflow - assert ((int)c.GetNumRows() == m && (int)c.GetNumCols() == n); - if ((int)c.GetNumRows() != m || (int)c.GetNumCols() != n) - throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - if (sizeof(ElemType) == sizeof(float)) - { - CUBLAS_CALL(cublasSaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); - } - else if (sizeof(ElemType) == sizeof(double)) - { - CUBLAS_CALL(cublasDaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - } - else if 
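// [Editor's note; not part of the original diff] A reading aid for the gemm call above:
// cuBLAS is column-major, so m/n/k describe op(A) and op(B) *after* the transpose flags
// are applied, while the leading dimensions (lda/ldb/ldc) remain the physical row
// counts of the stored matrices. Float-path sketch of the same call shape:
//
//     // C(m x n) = alpha * op(A)(m x k) * op(B)(k x n) + beta * C
//     CUBLAS_CALL(cublasSgemm(cuHandle,
//         transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
//         transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
//         m, n, k, &alpha,
//         d_A, (int)a.m_numRows,    // lda: rows of A as stored
//         d_B, (int)b.m_numRows,    // ldb: rows of B as stored
//         &beta, d_C, m));          // ldc: rows of the result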
(a.GetNumElements() == 1) - { - LONG64 N=(LONG64)c.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - c.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _scaleAndAddScalar<<>>(c.m_pArray, N, alpha, a.m_pArray); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else if (a.GetNumCols() == 1) //col vector, add it to all columns - { - long m = (long)c.GetNumRows(); - long n = (long)c.GetNumCols(); - if (m != (long)a.GetNumRows()) - throw std::invalid_argument("To add column vector, rows should match."); - - cudaEvent_t done = nullptr; - int blocksPerGrid = (int)(ceil(1.0*m*n / threadsPerBlock)); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); -#ifdef VALIDATION - printf(">>>> CUDA compute device is %d\n", a.GetComputeDeviceId()); - printf(">>>> a.m_pArray = %p, c.m_pArray = %p, alpha = %f, m = %ld, n = %ld\n", a.m_pArray,c.m_pArray,alpha,m,n); - for (int i=0; i < 2; i++) - { - ElemType buffer[10] = {-1.234f}; - cudaError_t error = cudaMemcpy(buffer, !i?a.m_pArray:c.m_pArray, sizeof(buffer), cudaMemcpyKind::cudaMemcpyDeviceToHost); - if (error == cudaError::cudaSuccess) - printf("buffer valid\n"); - } -#endif - - _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.m_pArray,c.m_pArray,alpha,m,n); - - - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else if (a.GetNumRows()==1) //row vector, add it to all rows - { - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - int m = (int)c.GetNumRows(); - int n = (int)c.GetNumCols(); - assert (n == (int)a.GetNumCols()); - if (n != (int)a.GetNumCols()) - throw std::invalid_argument("To add row vector, cols should match."); - - if (sizeof(ElemType) == sizeof(double)) - { - foreach_row(i,c) - { - CUBLAS_CALL(cublasDaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); - } - } - else - { - foreach_row(i,c) - { - CUBLAS_CALL(cublasSaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); - } - } - } - else - throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); - } - } - - /// c += alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) - { - throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AddScaledDifference: Input matrix a is empty."); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - 
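// [Editor's sketch; not part of the original diff] The column-vector branch above
// broadcasts an (m x 1) vector into every column of c with one thread per element; a
// plausible body for that kernel (hypothetical: the shipped
// _matrixVectorColumnWiseAddWithThreadPerElem is the authority):
//
//     template<class ElemType>
//     __global__ void _colBroadcastAdd(const ElemType* a, ElemType* c,
//                                      const ElemType alpha, const long m, const long n)
//     {
//         long idx = blockDim.x * blockIdx.x + threadIdx.x;
//         if (idx < m*n)
//             c[idx] += alpha * a[idx % m];   // idx % m = row index in column-major storage
//     }
//
// The row-vector branch instead reuses axpy once per row: incx=1 walks the vector while
// incy=m hops from column to column, i.e. along row i of the column-major matrix c.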
_addScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - /// c = alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - { - throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); - - if (&c != &a && &c != &b) - c.Resize(a.GetNumRows(), a.GetNumCols()); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - /// c += alpha * (a-b) - /// if a, b, c must have same dim - /// 1X1 matrix - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - assert(alpha.GetNumElements() == 1); - if (!(alpha.GetNumElements() == 1)) - throw std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); - - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) - { - throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AddScaledDifference: Input matrix a is empty."); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - /// c = alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - assert(alpha.GetNumElements() == 1); - if (!(alpha.GetNumElements() == 1)) - throw 
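// [Editor's sketch; not part of the original diff] The element-wise body behind the
// scaled-difference launches above is a one-liner; a plausible form (hypothetical: the
// shipped _addScaledDifference/_assignScaledDifference kernels are the authority):
//
//     template<class ElemType>
//     __global__ void _addScaledDiffSketch(const ElemType alpha, const ElemType* a,
//                                          const ElemType* b, ElemType* c, const LONG64 N)
//     {
//         LONG64 i = (LONG64)blockDim.x * blockIdx.x + threadIdx.x;
//         if (i < N)
//             c[i] += alpha * (a[i] - b[i]);  // the Assign* variant writes '=' instead of '+='
//     }
//
// The GPUMatrix-alpha overloads differ only in reading alpha from device memory.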
std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); - - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - { - throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); - - c.Resize(a.GetNumRows(), a.GetNumCols()); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - //c[ci,cj] += a[ai,aj] - template - void GPUMatrix::AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) - { - if (ai >= a.GetNumRows() || aj >=a.GetNumCols() || - ci >= c.GetNumRows() || cj >=c.GetNumCols()) - throw std::invalid_argument("AddElementToElement: index out of range."); - - a.PrepareDevice(); - cudaEvent_t done = nullptr; - int blocksPerGrid=1; //only one element - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addElementToElement<<>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj)); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) - { - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - if (sizeof(ElemType)==sizeof(float)) - { - float alph = (float)alpha; - CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(float*)a.m_pArray,1)); - } - else if (sizeof(ElemType)==sizeof(double)) - { - double alph = alpha; - CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(double*)a.m_pArray,1)); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - } - - - template - void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) - { - if (alpha.GetNumElements()!=1) - { - throw std::runtime_error("Matrix alpha must be 1x1"); - } - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - if (sizeof(ElemType)==sizeof(float)) - { - CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),(float*)alpha.m_pArray,(float*)a.m_pArray,1)); - } - else if (sizeof(ElemType)==sizeof(double)) - { - CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),(double*)alpha.m_pArray,(double*)a.m_pArray,1)); - } - else - { - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - } - - template //c = alpha * a - void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) - { - if (a.IsEmpty()) - throw std::logic_error("Scale: Input matrix a is empty."); - - c=a; - Scale(alpha,c); - } - - - template - void GPUMatrix::InnerProduct (const GPUMatrix& a, const 
GPUMatrix& b, GPUMatrix& c, const bool isColWise) - { - if (a.GetComputeDeviceId()!=b.GetComputeDeviceId() || b.GetComputeDeviceId()!=c.GetComputeDeviceId()) //different GPUs - throw std::invalid_argument("All matrices must be on the same GPU"); - - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("Scale: one of the input matrices is empty."); - - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int k = (int)b.GetNumRows(); - const int l = (int)b.GetNumCols(); - - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) - throw std::invalid_argument("Matrices a and b should have same dimension."); - - if (isColWise) - c.Resize(1,n); - else - c.Resize(m,1); - - if ((isColWise && m == 1) || !isColWise && n == 1) //in this case it's equivalent to element-wise product - { - c.AssignElementProductOf(a, b); - } - else - { - cudaEvent_t done = nullptr; - c.PrepareDevice(); - - int blocksPerGrid=0; - if (isColWise) //col-wise - { - c.Resize(1,n); - blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); - } - else - { - c.Resize(m, 1); - blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); - } - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _innerProduct<<>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - template - ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); - - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int k = (int)b.GetNumRows(); - const int l = (int)b.GetNumCols(); - - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) - throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - if (sizeof(ElemType) == sizeof(double)) - { - double tmp=0; - CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); - return ElemType(tmp); - //return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); - } - else - { - float tmp=0; - CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); - return tmp; - //return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); - } - } - - - template - GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); - - Resize(1,1); - - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int k = (int)b.GetNumRows(); - const int l = (int)b.GetNumCols(); - - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) - throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have 
same dimension."); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - if (sizeof(ElemType) == sizeof(double)) - { - CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); - } - else - { - CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); - } - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - return *this; - } - - - template - void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) - { - if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - if (a.IsEmpty()) - throw std::logic_error("ElementWisePower: The input matrix a is empty."); - if (a.GetNumRows()!=c.GetNumRows() || a.GetNumCols()!=c.GetNumCols()) - throw std::logic_error("ElementWisePower: matrices must be of the same size"); - - cudaEvent_t done = nullptr; - a.PrepareDevice(); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - LONG64 N=(LONG64)a.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - _elementWisePowerOnCuda<<>>(alpha,a.m_pArray,c.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - template - bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AreEqual: one of the input matrices is empty."); - - if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) - return false; - - a.PrepareDevice(); - long *res = new long[1]; - res[0]=1; - long *d_res = NULL; - CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(long)*1)); - CUDA_CALL(cudaMemcpy(d_res,res,sizeof(long)*1,cudaMemcpyHostToDevice)); - long N=(long)a.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - _areEqual<<>>(a.m_pArray,b.m_pArray,N,threshold,d_res); - CUDA_CALL(cudaMemcpy(res,d_res,sizeof(long)*1,cudaMemcpyDeviceToHost)); - if (res[0]!=0) - return true; - else - return false; - } - - template - GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols) - { - GPUMatrix c(rows, cols); //will initialize to 0 - c.SetValue(1); - return c; - } - - template - GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols) - { - GPUMatrix c(rows, cols); //will initialize to 0 - //c.SetValue(0); - return c; - } - - template - GPUMatrix GPUMatrix::Eye(const size_t rows) - { - GPUMatrix c(rows, rows); //will initialize to 0 - c.SetDiagonalValue(1); - return c; - } - - template - GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) - { - GPUMatrix c(rows, cols); //will initialize to 0 - c.SetUniformRandomValue(low, high, seed); - return c; - } - - template - GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) - { - GPUMatrix c(rows, cols); //will initialize to 0 - c.SetGaussianRandomValue(mean, sigma, seed); - return c; - } - - template - ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix &Gradients, const GPUMatrix &SmoothedGradients) - { - Gradients.PrepareDevice(); - ElemType* d_res=NULL; - 
CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(ElemType))); //we allocate memory on the device - - //Compute inner product of matrices and keep it on device - const int m = (int)Gradients.GetNumRows(); - const int n = (int)Gradients.GetNumCols(); - const int k = (int)SmoothedGradients.GetNumRows(); - const int l = (int)SmoothedGradients.GetNumCols(); - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); - - if (sizeof(ElemType) == sizeof(double)) - { - cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - } - else - { - cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - } - // d_res[0] should now contain inner product of matrices - // Compute squared Frobenius norms (squared sums of elements) - _lrHelper<<<1,512,0,t_stream>>>(Gradients.m_pArray,SmoothedGradients.m_pArray, (LONG64)Gradients.GetNumElements(), d_res); - ElemType res; - CUDA_CALL(cudaMemcpy(&res,d_res,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_res)); - return res; - } - -#pragma endregion Static BLAS Functions - - - //#pragma region File << and >> operators - // template - // File& operator>>(File& stream, GPUMatrix &us) - // { - // //auto& us = *this; - // - // stream.GetMarker(fileMarkerBeginSection, std::string("BMAT")); - // size_t elsize; - // stream>>elsize; - // if (sizeof(ElemType)!=elsize) - // throw std::runtime_error("Template argument size doesn't match those in file"); - // std::wstring matrixName; - // size_t numRows, numCols; - // stream>>matrixName>>numRows>>numCols; - // ElemType* d_array = new ElemType[numRows*numCols]; - // for (long i=0;i>d_array[i]; - // stream.GetMarker(fileMarkerEndSection, std::string("EMAT")); - // us.SetValue(numRows,numCols,d_array, matrixFlagNormal); - // us.m_matrixName = matrixName; - // return stream; - // } - // - // template - // File& operator<<(File& stream, GPUMatrix &us) - // { - // //auto& us = *this; - // - // stream.PutMarker(fileMarkerBeginSection, std::string("BMAT")); - // stream<> operators - - template class GPUMatrix; - template class GPUMatrix; - template class DeviceBoundNumber; - template class DeviceBoundNumber; - - template - cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus]={0}; - - template - void* GPUMatrix::s_curandGenerator=NULL; -}}} - -// !!!!This is from helper_cuda.h which comes with CUDA samples!!!! 
-// Consider if it is beneficial to just include all of helper_cuda.h
-// TODO: This is duplicated in BestGpu.cpp
-// Beginning of GPU Architecture definitions
-int _ConvertSMVer2Cores(int major, int minor)
-{
-    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
-    typedef struct
-    {
-        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
-        int Cores;
-    } sSMtoCores;
-
-    sSMtoCores nGpuArchCoresPerSM[] =
-    {
-        { 0x10,  8 },  // Tesla Generation (SM 1.0) G80 class
-        { 0x11,  8 },  // Tesla Generation (SM 1.1) G8x class
-        { 0x12,  8 },  // Tesla Generation (SM 1.2) G9x class
-        { 0x13,  8 },  // Tesla Generation (SM 1.3) GT200 class
-        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
-        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
-        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
-        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
-        { -1, -1 }
-    };
-
-    int index = 0;
-
-    while (nGpuArchCoresPerSM[index].SM != -1)
-    {
-        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
-        {
-            return nGpuArchCoresPerSM[index].Cores;
-        }
-
-        index++;
-    }
-    return nGpuArchCoresPerSM[7].Cores;
-}
-// end of GPU Architecture definitions
-
-//inline long _GetFreeMemoryOnCUDADevice(int devId)
-//{
-//    CUdevice cudaDevice;
-//    CUresult result = cuDeviceGet(&cudaDevice, devId);
-//    if (result != CUDA_SUCCESS)
-//    {
-//        return 0;
-//    }
-//
-//    // create cuda context
-//    CUcontext cudaContext;
-//    result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice);
-//    if (result != CUDA_SUCCESS)
-//    {
-//        return 0;
-//    }
-//
-//    // get the amount of free memory on the graphics card
-//    size_t free;
-//    size_t total;
-//    result = cuMemGetInfo(&free, &total);
-//    if (result != CUDA_SUCCESS)
-//    {
-//        return 0;
-//    }
-//    else
-//        return (long)free;
-//}
-
-#endif // CPUONLY
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+
+#include "stdafx.h"
+#include "BestGpu.h"
+
+#ifndef CPUONLY
+
+#include "cublas_v2.h"
+#include <assert.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include "device_launch_parameters.h"
+#include "GPUMatrix.h"
+#include "GPUMatrixCUDAKernels.cu"
+#include "GPUSparseMatrix.h"
+#include <iostream> // for cout
+
+#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
+#pragma comment (lib, "cublas.lib")
+#pragma comment (lib, "cusparse.lib")
+#pragma comment (lib, "curand.lib")
+
+#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
+#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
+#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons
+
+#ifdef NO_SYNC
+bool do_sync = false;
+#else
+bool do_sync = true;
+#endif
+
+#ifdef _WIN32
+// thread local storage to access the current stream, initialize to default stream
+__declspec (thread)
+#endif
+cudaStream_t t_stream = cudaStreamDefault;
+
+extern int _ConvertSMVer2Cores(int major, int minor); // forward declaration
+
+// SetStream - set the stream that will be used by the GPU routines
+void MATH_API SetStream(cudaStream_t stream)
+{
+    t_stream = stream;
+}
+
+// GetStream - get the stream that will be used by the GPU routines
+cudaStream_t MATH_API GetStream()
+{
+    return t_stream;
+}
+
+
+void CURAND_CALL(curandStatus x)
+{
+    if (x != CURAND_STATUS_SUCCESS)
+    {
+        throw std::runtime_error("CURAND fail");
+    }
+}
+
+void CUBLAS_CALL(cublasStatus_t x)
+{
+    if (x != CUBLAS_STATUS_SUCCESS)
+    {
+        throw std::runtime_error("CUBLAS fail");
+    }
+}
+
+void CUDA_CALL(cudaError_t x)
+{
+    if (x != cudaSuccess)
+    {
+        const char* errmsg = cudaGetErrorString(x);
+        std::cerr << "!!!!!!!!CUDA EXCEPTION: " << errmsg << std::endl;
+        cudaDeviceSynchronize();
+        throw std::runtime_error(errmsg);
+    }
+}
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+    // PrepareDevice - Setup the correct cuda context for an operation
+    // deviceId - the device on which the operation will take place
+    void PrepareDevice(DEVICEID_TYPE deviceId)
+    {
+        static DEVICEID_TYPE currentDevice = AUTOPLACEMATRIX; // set to anything valid
+        // externally managed matrices are guaranteed to be on the right device
+        if (deviceId == MANAGEDEXTERN)
+            return;
+        // and if we last set the device to be this device we are good
+        if (deviceId == currentDevice)
+            return;
+        CUDA_CALL(cudaSetDevice(deviceId));
+        currentDevice = deviceId;
+    }
+
+#pragma region DeviceBoundNumber class
+
+    template<class ElemType>
+    DeviceBoundNumber<ElemType>::DeviceBoundNumber(const DeviceBoundNumber<ElemType>& /*deepCopy*/)
+    {
+        NOT_IMPLEMENTED;
+    }
+
+    template<class ElemType>
+    DeviceBoundNumber<ElemType>::DeviceBoundNumber(DeviceBoundNumber<ElemType>&& shallowCopy)
+    {
+        ShallowCopyFrom(shallowCopy.m_data, shallowCopy.m_computeDevice);
+        shallowCopy.m_data = NULL;
+    }
+
+    template<class ElemType>
+    void DeviceBoundNumber<ElemType>::ShallowCopyFrom(ElemType* newVal, int newValsDevceId)
+    {
+        m_computeDevice = newValsDevceId;
+        m_data = newVal;
+    }
+
+    template<class ElemType>
+    DeviceBoundNumber<ElemType>::~DeviceBoundNumber()
+    {
+        if (m_data != NULL)
+        {
+            if (m_computeDevice < 0)
+            {
+                delete m_data;
+                m_data = NULL;
+            }
+            else if (m_computeDevice != MANAGEDEXTERN)
+                CUDA_CALL(cudaFree(m_data));
+        }
+    }
+
+#pragma endregion DeviceBoundNumber class
+
+#pragma region Helper functions
+    template<class ElemType>
+    cublasHandle_t _initCUBLAS(int devId)
+    {
+        PrepareDevice((DEVICEID_TYPE)devId);
+        cublasHandle_t cuHandle;
+        CUBLAS_CALL(cublasCreate(&cuHandle));
+        return cuHandle;
+    }
+ + // GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information + // TODO: should be replaced by BestGpu class instead, it's much better + template + DEVICEID_TYPE GPUMatrix::GetBestGPUDeviceId() //returns -1 if no GPUs can be used + { + // currently there is little point in giving out different device IDs each time ask for a matrix, + // we really want them all on the same device eventually + static int chosenDeviceId = AUTOPLACEMATRIX; + if (chosenDeviceId != AUTOPLACEMATRIX) + return chosenDeviceId; + + __try + { + // stash previous device state + // if there was one on entry: + int nPrevDev = -1; + cudaError_t ePrevDev = cudaGetDevice(&nPrevDev); + + int deviceCount = -1; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + if (error_id != cudaSuccess || deviceCount==0) + { + return -1; + } + + int setDev = -1; + int curDev=0; + long curPower = 0; + for (DEVICEID_TYPE dev = 0; dev < deviceCount; ++dev) + { + CUDA_CALL(cudaSetDevice(dev)); + setDev = dev; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + long power = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; + //long power = _GetFreeMemoryOnCUDADevice(dev); + if (power>curPower) + { + curPower=power; + curDev = dev; + } + } + + if(nPrevDev >= 0 && ePrevDev == cudaSuccess && + setDev >= 0 && setDev != nPrevDev) { + // restore current context to the one we entered with + // if there was one the caller might want unchanged. + cudaSetDevice(nPrevDev); + } + chosenDeviceId = curDev; + return curDev; + } + __except (1) + { + return -1; // CPU + } + } + + // PrepareDevice - Setup the correct cuda context for an operation + // deviceId - the device on which the operation will take place + // defaults to -1, which means use matrices current device + template + DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const + { + // if default value use current compute device + DEVICEID_TYPE newId = deviceId >= 0 ? 
deviceId : m_computeDevice;
+
+        Microsoft::MSR::CNTK::PrepareDevice(newId);
+        return newId;
+    }
+
+    template<class ElemType>
+    ElemType* GPUMatrix<ElemType>::CopyToArray() const
+    {
+        size_t numElements = GetNumElements();
+        if (numElements != 0)
+        {
+            PrepareDevice();
+            ElemType* pArray = new ElemType[numElements];
+            CUDA_CALL(cudaMemcpy(pArray, m_pArray, sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyDeviceToHost));
+            return pArray;
+        }
+        else
+        {
+            return NULL;
+        }
+    }
+
+    // memory will be (re)allocated by this function if the buffer is too small; it must be deleted by the caller when done
+    // returns the number of elements copied
+    template<class ElemType>
+    size_t GPUMatrix<ElemType>::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const
+    {
+        size_t numElements = GetNumElements();
+
+        if (numElements > currentArraySize)
+        {
+            delete[] arrayCopyTo;
+            arrayCopyTo = new ElemType[numElements];
+            currentArraySize = numElements;
+        }
+
+        if (numElements != 0)
+        {
+            PrepareDevice();
+            CUDA_CALL(cudaMemcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements, cudaMemcpyDeviceToHost));
+        }
+
+        return numElements;
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::ChangeDeviceTo(DEVICEID_TYPE to_id)
+    {
+        if (!OwnBuffer())
+            throw std::logic_error("Cannot change device on Managed external matrix");
+        if (to_id == CPUDEVICE)
+            throw std::logic_error("to_id must be a valid GPU");
+        if (m_computeDevice == to_id)
+            return;
+
+        PrepareDevice((DEVICEID_TYPE)to_id);
+        ElemType* d_dst = NULL;
+        CUDA_CALL(cudaMalloc((void**)&d_dst, sizeof(ElemType)*m_numRows*m_numCols));
+
+        m_elemSizeAllocated = m_numRows*m_numCols;
+
+        // check to make sure we have something to copy (on init we often have zero sized allocations)
+        if (m_elemSizeAllocated > 0)
+        {
+            // first try peer access
+            int canAccessPeer = false;
+            CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice));
+            if (canAccessPeer)
+            {
+                CUDA_CALL(cudaDeviceEnablePeerAccess(m_computeDevice, 0));
+                CUDA_CALL(cudaMemcpyPeer(d_dst, to_id, m_pArray, m_computeDevice, sizeof(ElemType)*m_numRows*m_numCols));
+            }
+            else
+            {
+                // peer access didn't work, fall back to copying through the host
+                // TODO: make this more efficient by keeping some buffers available for each copy
+                ElemType* h_dst = NULL;
+                PrepareDevice();
+                CUDA_CALL(cudaMallocHost((void**)&h_dst, sizeof(ElemType)*m_numRows*m_numCols));
+                CUDA_CALL(cudaMemcpy(h_dst, m_pArray, sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyDeviceToHost));
+                PrepareDevice((DEVICEID_TYPE)to_id);
+                CUDA_CALL(cudaMemcpy(d_dst, h_dst, sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyHostToDevice));
+                CUDA_CALL(cudaFreeHost(h_dst));
+            }
+        }
+        PrepareDevice();
+        CUDA_CALL(cudaFree(m_pArray));
+        m_pArray = d_dst;
+
+        PrepareDevice((DEVICEID_TYPE)to_id);
+        m_computeDevice = to_id;
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::performInplaceFunction(int kind)
+    {
+        PrepareDevice();
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        switch (kind)
+        {
+        case 0:
+            _inplaceSigmoidOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 1:
+            _inplaceTanhOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 2:
+            _inplaceSqrtOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 3:
+            _inplaceExpOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 4:
+            _inplaceLogOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 5:
+            _inplaceAbsOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 6:
+            _inplaceLinRectDerivative<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 7:
+            _inplaceCosineOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 8:
+            _inplaceNegativeSineOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        }
+        if (do_sync)
CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + +#pragma endregion Helper functions + +#pragma region Constructors and Destructor + + //should only be used by constructors. + template + void GPUMatrix::ZeroInit(int deviceId) + { + m_computeDevice = deviceId; + m_pArray = nullptr; + m_numRows = 0; + m_numCols = 0; + m_elemSizeAllocated = 0; + m_matrixName=NULL; + m_format = matrixFormatDense; + m_externalBuffer = false; + } + + template + GPUMatrix::GPUMatrix(int deviceId) + { + if (deviceId == MANAGEDEXTERN) + throw std::logic_error("Basic constructor cannot be used with Managed Extern types"); + + ZeroInit(deviceId); + }; + + //matrixName is used to verify that correct matrix is read. + template + GPUMatrix::GPUMatrix(FILE* f, const char * matrixName, int deviceId) + { + if (deviceId == MANAGEDEXTERN) + throw std::logic_error("File constructor cannot be used with Managed Extern types"); + + ReadFromFile(f, matrixName); + } + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols,int deviceId) + { + if (deviceId == MANAGEDEXTERN) + throw std::logic_error("constructor cannot be used with Managed Extern types"); + ZeroInit(deviceId); + m_numRows = numRows; + m_numCols = numCols; + m_elemSizeAllocated = GetNumElements(); + + if (m_elemSizeAllocated != 0) + { + PrepareDevice(); + CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); + CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); + } + }; + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags, int deviceId) + { + ZeroInit(deviceId); + SetValue(numRows, numCols, pArray, matrixFlags, deviceId); + }; + + template + GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) + { + ZeroInit(deepCopyFrom.m_computeDevice); + SetValue(deepCopyFrom); + SetMatrixName(deepCopyFrom.m_matrixName); + } + + template + GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) + { + m_numRows = moveFrom.m_numRows; + m_numCols = moveFrom.m_numCols; + m_computeDevice = moveFrom.m_computeDevice; + m_pArray = moveFrom.m_pArray; //shallow copy the pointer + m_matrixName=moveFrom.m_matrixName; + m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + m_format = moveFrom.m_format; + m_externalBuffer = moveFrom.m_externalBuffer; + + //release the pointer from the source object so that the destructor won't release it twice + moveFrom.ZeroInit(0); + } + + //assignment operator, deep copy + template + GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) + { + if (this != &deepCopyFrom) + { + SetValue(deepCopyFrom); + SetMatrixName(deepCopyFrom.m_matrixName); + } + return *this; + } + + //move assignment operator, shallow copy + template + GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) + { + if (this != &moveFrom) + { + if (OwnBuffer() && m_pArray!=NULL) + { + CUDA_CALL(cudaFree(m_pArray)); + } + + m_numRows = moveFrom.m_numRows; + m_numCols = moveFrom.m_numCols; + m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + m_pArray = moveFrom.m_pArray; + m_computeDevice = moveFrom.m_computeDevice; + m_format = moveFrom.m_format; + m_externalBuffer = moveFrom.m_externalBuffer; + + //release the pointer from the source object so that the destructor won't release it twice + moveFrom.ZeroInit(0); + } + return *this; + } + + template + GPUMatrix::~GPUMatrix(void) + { + Clear(); + } + + template + void GPUMatrix::Clear() + { + if (OwnBuffer() && 
m_pArray!=NULL) + { + if (m_computeDevice>=0) + { + PrepareDevice(); + cudaFree(m_pArray); + m_pArray = NULL; + m_elemSizeAllocated = 0; + } + } + BaseMatrix::Clear(); + + ZeroInit(m_computeDevice); + } +#pragma endregion Constructors and Destructor + + template + int GPUMatrix::GetComputeDeviceId() const + { + // for externally managed memory the CUDA context will have the current device + if (m_computeDevice == MANAGEDEXTERN) + { + int devId; + assert(m_externalBuffer); + CUDA_CALL(cudaGetDevice(&devId)); + return devId; + } + return m_computeDevice; + } + +#pragma region Basic Operators + template + GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const + { + if (numCols == 0) + throw std::logic_error("The slice cannot have 0 columns."); + + if (startColumn + numCols > m_numCols) + throw std::logic_error("The slice is out of range of the source matrix."); + + GPUMatrix slice(m_numRows, numCols, m_pArray + startColumn * m_numRows, matrixFlagDontOwnBuffer, m_computeDevice); + + return slice; + } + + template + GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) + { + if (numCols == 0) + throw std::logic_error("The slice cannot have 0 columns."); + + if (startColumn + numCols > m_numCols) + throw std::logic_error("The slice is out of range of the source matrix."); + + Clear(); + + m_computeDevice=fromMatrix.m_computeDevice; + m_externalBuffer=true; + m_numRows = fromMatrix.m_numRows; + m_pArray=fromMatrix.m_pArray + startColumn * m_numRows; + + m_elemSizeAllocated = GetNumElements(); + m_matrixName=NULL; + m_format = fromMatrix.m_format; + + return *this; + } + + + //for each column of a, we assign numRows starting from startIndex to this + template + GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + throw std::logic_error("AssignRowSliceValuesOf: input matrix a is empty."); + + if (startIndex + numRows > a.GetNumRows()) + throw std::logic_error("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); + + Resize(numRows, a.GetNumCols()); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)numRows, (long)a.GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + //for the row slice of this starting from startIndex we add a to it. 
+ template + GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + throw std::logic_error("AddToRowSliceValuesOf: input matrix a is empty."); + + if (a.GetNumRows() != numRows) + throw std::logic_error("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); + + if (startIndex + numRows > GetNumRows()) + throw std::logic_error("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + throw std::logic_error("AddToRowSliceValuesOf: columns does not match."); + + LONG64 N=(LONG64)a.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addToRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + //for each column of this, we add row slice of a starting from startIndex + template + GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + throw std::logic_error("AddWithRowSliceValuesOf: input matrix a is empty."); + + if (GetNumRows() != numRows) + throw std::logic_error("AddWithRowSliceValuesOf: GetNumRows() != numRows."); + + if (startIndex + numRows > a.GetNumRows()) + throw std::logic_error("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + throw std::logic_error("AddWithRowSliceValuesOf: columns does not match."); + + LONG64 N = (LONG64)GetNumElements(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addWithRowSliceValuesOf << > >(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) + { + if (this == &a) + throw std::logic_error("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); + + if (a.IsEmpty()) + throw std::logic_error("AssignRepeatOf: Matrix a is empty."); + + Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); + + LONG64 N = (LONG64)GetNumElements(); + long n = (long)a.GetNumCols(), m = (long)a.GetNumRows(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignRepeatOf << > >(m_pArray, a.m_pArray, N, m, n, (long)GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix GPUMatrix::Transpose() const + { + if (IsEmpty()) + throw std::logic_error("Transpose: Matrix is empty."); + + GPUMatrix c(GetComputeDeviceId()); + c.AssignTransposeOf(*this); + return c; + } + + // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU + // computeDevice - The compute device for which the cublas handle is desired + // returns: cublas handle + // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends + template + cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice/*=-1*/) + { + // if the compute device is not passed, get the current device from CUDA + if (computeDevice < 0) + cudaGetDevice(&computeDevice); + + if (computeDevice < 0 || computeDevice >= MaxGpus) + throw std::logic_error("GetCublasHandle: Maximum GPU exceeded"); + cublasHandle_t cuHandle = s_cuHandle[computeDevice]; + if (cuHandle == NULL) + { + s_cuHandle[computeDevice] = cuHandle = _initCUBLAS(computeDevice); + } + CUBLAS_CALL(cublasSetStream(cuHandle, t_stream)); + + return cuHandle; + } + + template + GPUMatrix& GPUMatrix::AssignTransposeOf (const GPUMatrix& a) + { + if (this == &a) + throw std::logic_error("AssignTransposeOf: a is the same as [this]. 
Does not support inplace transpose.");
+
+        if (a.IsEmpty())
+            throw std::logic_error("AssignTransposeOf: Matrix a is empty.");
+
+        if (GetNumRows() != a.GetNumCols() || GetNumCols() != a.GetNumRows())
+            Resize(a.GetNumCols(), a.GetNumRows());
+
+        cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
+        cublasOperation_t transA = CUBLAS_OP_T;
+        cublasOperation_t transB = CUBLAS_OP_T;
+        int m = (int)a.m_numCols;
+        int n = (int)a.m_numRows;
+        ElemType alpha = 1;
+        ElemType beta = 0;
+        cublasStatus_t st;
+        if (sizeof(ElemType) == sizeof(float))
+        {
+            st = cublasSgeam(cuHandle, transA, transB, m, n, reinterpret_cast<float*>(&alpha), reinterpret_cast<float*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<float*>(&beta), reinterpret_cast<float*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<float*>(m_pArray), (int)m_numRows);
+        }
+        else if (sizeof(ElemType) == sizeof(double))
+        {
+            st = cublasDgeam(cuHandle, transA, transB, m, n, reinterpret_cast<double*>(&alpha), reinterpret_cast<double*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<double*>(&beta), reinterpret_cast<double*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<double*>(m_pArray), (int)m_numRows);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported template argument in GPUMatrix");
+        }
+        if (st != CUBLAS_STATUS_SUCCESS)
+        {
+            throw std::runtime_error("AssignTransposeOf failed");
+        }
+        m_numRows = a.m_numCols;
+        m_numCols = a.m_numRows;
+        SetMatrixName(a.GetMatrixName());
+        return *this;
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const ElemType v)
+    {
+        if (IsEmpty())
+            throw std::logic_error("SetValue: Matrix is empty.");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _setValue<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, v, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const ElemType* d_v) //d_v is a pointer to the value in GPU memory
+    {
+        if (IsEmpty())
+            throw std::logic_error("SetValue: Matrix is empty.");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _setValue<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, d_v, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetColumn(const ElemType* colPointer, size_t colInd)
+    {
+        if (IsEmpty())
+            throw std::logic_error("SetColumn: Matrix is empty.");
+        if (colPointer == NULL)
+            return;
+        CUDA_CALL(cudaMemcpy(m_pArray + LocateColumn(colInd), colPointer, sizeof(ElemType)*m_numRows, cudaMemcpyHostToDevice));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& deepCopyFrom)
+    {
+        if (this == &deepCopyFrom)
+            return;
+
+        Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols());
+        m_format = deepCopyFrom.m_format; // copy the format over just to be sure
+        size_t cpSize = deepCopyFrom.GetNumRows() * deepCopyFrom.GetNumCols();
+        if (cpSize != 0)
+            CUDA_CALL(cudaMemcpy(m_pArray, deepCopyFrom.m_pArray, cpSize*sizeof(ElemType), cudaMemcpyDeviceToDevice));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, size_t matrixFlags, int deviceId)
+    {
+        // handle externally managed case
+        if (matrixFlags & matrixFlagDontOwnBuffer)
+        {
+            // free the existing array if it used to be an owned array
+            if (OwnBuffer() && m_pArray != NULL)
+            {
PrepareDevice(); + CUDA_CALL(cudaFree(m_pArray)); + } + m_numRows = numRows; + m_numCols = numCols; + m_pArray = pArray; + m_elemSizeAllocated = GetNumElements(); + m_matrixName = NULL; + m_format = matrixFormatDense; + m_externalBuffer = true; + m_computeDevice = deviceId; + } + else + { + // if didn't previously own the buffer, wipe it clean + if (!OwnBuffer()) + { + ZeroInit(deviceId); + } + + // if the devices are different move it now + if (m_computeDevice != deviceId && deviceId >= 0) + { + Clear(); + ZeroInit(deviceId); + } + + // now resize/allocate as necessary + Resize(numRows, numCols); + m_externalBuffer = false; + + // copy over the content to the buffer + PrepareDevice(); + if (pArray!=NULL) + { + if (!(matrixFlags&matrixFormatRowMajor)) + { + CUDA_CALL(cudaMemcpy(m_pArray, pArray, sizeof(ElemType)*GetNumElements(), + (matrixFlags&matrixFlagSetValueOnDevice)?cudaMemcpyDeviceToDevice:cudaMemcpyHostToDevice)); + } + else + { + throw std::runtime_error("Row major isn't implemented"); + } + } + } + m_format = matrixFormatDense; + } + + + template + void GPUMatrix::SetDiagonalValue(const ElemType v) + { + unsigned long N=(unsigned long)GetNumRows(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setDiagonalValue<<>>(m_pArray,v,N,(unsigned long)GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::SetDiagonalValue(GPUMatrix& vector) + { + if (IsEmpty() || vector.IsEmpty()) + throw std::logic_error("SetDiagonalValue: Matrix is empty."); + + if (GetNumRows() != GetNumCols()) + throw std::logic_error("SetDiagonalValue: NumRows and NumCols do not agree."); + + if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) + throw std::logic_error("SetDiagonalValue: input vector must be a vector."); + + if (vector.GetNumElements() == 1) //reduce to simple form + SetDiagonalValue(vector.m_pArray[0]); + + else if (vector.GetNumRows() != GetNumRows()) + throw std::logic_error("SetDiagonalValue: input vector's dimension does not agree with [this]."); + else + { + long N=(long)GetNumRows(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setDiagonalValueFromVector<<>>(m_pArray,vector.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + template + void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) + { + PrepareDevice(); + if (s_curandGenerator==NULL) + { + s_curandGenerator = new curandGenerator_t; + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); + CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); + } + + cudaEvent_t done = nullptr; + CUDA_CALL(cudaEventCreate(&done)); + if (sizeof(ElemType)==sizeof(float)) + { + CURAND_CALL(curandGenerateUniform(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); + } + else + { + CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); + } + CUDA_CALL(cudaEventRecord(done)); + CUDA_CALL(cudaEventSynchronize(done)); + //CURAND_CALL(curandDestroyGenerator(gen)); + CUDA_CALL(cudaEventDestroy(done)); + + size_t N=GetNumElements(); + size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _rescaleToRange<<>>(m_pArray,N,low,high); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) + { + PrepareDevice(); + if (s_curandGenerator==NULL) + { + s_curandGenerator = new curandGenerator_t; + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? time(NULL) : seed)); + CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); + } + + if (sizeof(ElemType)==sizeof(float)) + { + CURAND_CALL(curandGenerateNormal(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (float)mean, (float)sigma)); + } + else + { + CURAND_CALL(curandGenerateNormalDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (double)mean, (double)sigma)); + } + //CURAND_CALL(curandDestroyGenerator(gen)); + } + + //maskRate: percentage of values masked out (similar to dropout rate) + //scaleValue: which scale value to set to the left ones (unmasked items). + template + void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) + { + PrepareDevice(); + if (s_curandGenerator==NULL) + { + s_curandGenerator = new curandGenerator_t; + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); + CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); + } + + cudaEvent_t done = nullptr; + CUDA_CALL(cudaEventCreate(&done)); + if (sizeof(ElemType)==sizeof(float)) + { + CURAND_CALL(curandGenerateUniform((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); + } + else + { + CURAND_CALL(curandGenerateUniformDouble((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); + } + CUDA_CALL(cudaEventRecord(done)); + CUDA_CALL(cudaEventSynchronize(done)); + CUDA_CALL(cudaEventDestroy(done)); + //CURAND_CALL(curandDestroyGenerator(gen)); + + size_t N=GetNumElements(); + size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setMaskAndScale<<>>(m_pArray,N,maskRate,scaleValue); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::Adagrad(GPUMatrix& gradients) + { + if (IsEmpty()) + { + Resize(gradients.GetNumRows(), gradients.GetNumCols()); + SetValue(0.0); + } + + assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols()); + + int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; + _adagrad<<>>(m_pArray, gradients.m_pArray, GetNumElements()); + } + + template + void GPUMatrix::RmsProp(GPUMatrix& gradients, + ElemType RMS_GAMMA, + ElemType RMS_WGT_INC, + ElemType RMS_WGT_MAX, + ElemType RMS_WGT_DEC, + ElemType RMS_WGT_MIN + ) + { + const ElemType floor = 1e-6f; + static ElemType *upd_gpu = (ElemType*)0; + + size_t n = gradients.GetNumElements(); + int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; + + if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3) + { + Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3); + SetValue(0.0); + + ElemType *avars=m_pArray; // accumulated variances for RMS scaling + ElemType *signs=m_pArray+n; // sign of previous gradient + ElemType *steps=m_pArray+2*n; // current step size + + _rmsprop_init<<>>(avars,signs,steps,gradients.m_pArray,n); + + } + + ElemType *avars=m_pArray; // accumulated variances for RMS scaling + ElemType *signs=m_pArray+n; // sign of previous gradient + ElemType *steps=m_pArray+2*n; // current step size + + assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3); + + if( !upd_gpu ) + { + ElemType upd[] = { + 2,2,0, + 2,2,0, + 1,1,1, + 2,2,0, + 1,2,1, + 0,2,2, + 1,1,1, + 0,2,2, + 0,2,2, + }; + + CUDA_CALL(cudaMalloc((void**)&upd_gpu,sizeof(ElemType)*27)); + CUDA_CALL(cudaMemcpy(upd_gpu,upd,sizeof(ElemType)*27,cudaMemcpyHostToDevice)); + } + + _rmsprop<<>>(avars,signs,steps,gradients.m_pArray,n, + RMS_GAMMA,RMS_WGT_INC,RMS_WGT_MAX,RMS_WGT_DEC,RMS_WGT_MIN, + floor,upd_gpu); + } + + template + void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) + { + assert (numRows*numCols == GetNumElements()); + if (numRows*numCols != GetNumElements()) + throw std::invalid_argument("Reshape: total number of elements does not match."); + + m_numRows = numRows; + m_numCols = numCols; + } + + template + void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) + { + if (m_numRows==numRows && m_numCols==numCols) + return; + + m_numRows = numRows; + m_numCols = numCols; + + size_t numElements = GetNumElements(); + if (numElements > 
m_elemSizeAllocated || (!growOnly && numElements != m_elemSizeAllocated)) + { + if (IsEmpty()) + { + m_elemSizeAllocated = 0; + m_pArray = NULL; + } + else + { + if (!OwnBuffer()) + throw std::invalid_argument("Can't resize a externally managed matrix"); + PrepareDevice(); + if (m_pArray!=NULL) + CUDA_CALL(cudaFree(m_pArray)); //delete and reallocate + m_elemSizeAllocated = numElements; + CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); + CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); + } + } + } + + template + size_t GPUMatrix::LocateElement (const size_t row, const size_t col) const + { + assert (row < m_numRows && col < m_numCols); + return col * m_numRows + row; // matrix in column-wise storage + } + + template + size_t GPUMatrix::LocateColumn (const size_t col) const + { + assert (col < m_numCols); + return col * m_numRows; // matrix in column-wise storage + } + + template + ElemType GPUMatrix::Get00Element() const + { + ElemType res=0; + CUDA_CALL(cudaMemcpy(&res,m_pArray,sizeof(ElemType),cudaMemcpyDeviceToHost)); + return res; + } +#pragma endregion Basic Operators + +#pragma region Member BLAS Functions + template + GPUMatrix& GPUMatrix::operator+= (ElemType alpha) + { + if (IsEmpty()) + throw std::logic_error("operator+=: Matrix is empty."); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addValue<<>>(m_pArray,alpha,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (ElemType alpha) const + { + if (IsEmpty()) + throw std::logic_error("operator+: Matrix is empty."); + + const GPUMatrix& us=*this; + GPUMatrix c(us); + c+=alpha; + return c; + } + + template + GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) + { + SetValue(a); + (*this)+=alpha; + return (*this); + } + + + template + GPUMatrix& GPUMatrix::operator+= (const GPUMatrix& a) + { + //if (a.GetNumElements()==1) + //{ + // //*this += a.Get00Element(); + // LONG64 N=(LONG64)GetNumElements(); + // int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + // cudaEvent_t done = nullptr; + // if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + // _addValue<<>>(m_pArray,a.m_pArray,N); + // if (do_sync) CUDA_CALL(cudaEventRecord(done)); + // if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + // if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + //} + //else + //{ + ScaleAndAdd(1, a, *this); + //} + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (const GPUMatrix& a) const + { + if (GetNumElements()==1) + { + GPUMatrix c(a); + c+=Get00Element(); + return c; + } + else if (a.GetNumElements()==1) + { + GPUMatrix c(*this); + c+=a.Get00Element(); + return c; + } + else + { + GPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but reuses the existing code
+        c += a;
+        return c;
+        }
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSumOf(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
+    {
+        SetValue(a);
+        (*this) += b;
+        return (*this);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator-= (ElemType alpha)
+    {
+        if (IsEmpty())
+            throw std::logic_error("operator-=: Matrix is empty.");
+        return operator+=(-1*alpha);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator- (ElemType alpha) const
+    {
+        if (IsEmpty())
+            throw std::logic_error("operator-: Matrix is empty.");
+        return operator+(-1*alpha);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignDifferenceOf(const ElemType alpha, const GPUMatrix<ElemType>& a)
+    {
+        Resize(a.m_numRows, a.m_numCols);
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignDifferenceOf1<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, alpha, a.m_pArray, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+        /*Resize(a.m_numRows,a.m_numCols);
+        SetValue(alpha);
+        (*this)-=a;
+        return *this;*/
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignDifferenceOf(const GPUMatrix<ElemType>& a, const ElemType alpha)
+    {
+        Resize(a.m_numRows, a.m_numCols);
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignDifferenceOf2<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, alpha, a.m_pArray, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+        /*SetValue(a);
+        (*this)-=alpha;
+        return *this;*/
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator-= (const GPUMatrix<ElemType>& a)
+    {
+        //if (a.GetNumElements() == 1)
+        //    AssignDifferenceOf(*this, a.Get00Element());
+        //else if (GetNumElements() == 1)
+        //    AssignDifferenceOf(Get00Element(), a);
+        //else
+        ScaleAndAdd(-1, a, *this);
+
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator- (const GPUMatrix<ElemType>& a) const
+    {
+        GPUMatrix<ElemType> c(*this); //this implementation will introduce a copy overhead.
but reuses the existing code
+        c -= a;
+        return c;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignDifferenceOf(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
+    {
+        if (this != &a)
+        {
+            Resize(a.GetNumRows(), a.GetNumCols());
+            SetValue(a);
+        }
+        (*this) -= b;
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator*= (ElemType alpha)
+    {
+        Scale(alpha, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator* (ElemType alpha) const
+    {
+        GPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+        Scale(alpha, *this, c);
+        return c;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignProductOf(const ElemType alpha, const GPUMatrix<ElemType>& a)
+    {
+        Scale(alpha, a, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignProductOf (const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB)
+    {
+        if (a.GetNumElements() == 1)
+        {
+            if (transposeB)
+                AssignTransposeOf(b);
+            (*this) *= a.Get00Element();
+        }
+        else if (b.GetNumElements() == 1)
+        {
+            if (transposeA)
+                AssignTransposeOf(a);
+            (*this) *= b.Get00Element();
+        }
+        else
+            Multiply(a, transposeA, b, transposeB, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator* (const GPUMatrix<ElemType>& a) const
+    {
+        const GPUMatrix<ElemType>& us = *this;
+        if (GetNumElements() == 1)
+        {
+            GPUMatrix<ElemType> c(GetComputeDeviceId());
+            c.AssignProductOf(Get00Element(), a);
+            return c;
+        }
+        else if (a.GetNumElements() == 1)
+        {
+            GPUMatrix<ElemType> c(GetComputeDeviceId());
+            c.AssignProductOf(a.Get00Element(), us);
+            return c;
+        }
+        else
+        {
+            GPUMatrix<ElemType> c(GetNumRows(), a.GetNumCols(), GetComputeDeviceId());
+            Multiply(*this, a, c);
+            return c;
+        }
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator/= (ElemType alpha)
+    {
+        (*this) *= 1/alpha;
+        return (*this);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator/ (ElemType alpha) const
+    {
+        return ((*this) * (1/alpha));
+    }
+
+    //element-wise power
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator^= (ElemType alpha)
+    {
+        GPUMatrix<ElemType>& us = *this;
+        ElementWisePower(alpha, us, us);
+        return us;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator^ (ElemType alpha) const
+    {
+        GPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+        ElementWisePower(alpha, *this, c);
+        return c;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignElementPowerOf(const GPUMatrix<ElemType>& a, const ElemType power)
+    {
+        ElementWisePower(power, a, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddElementProductOf (const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            throw std::logic_error("AddElementProductOf: Matrix is empty.");
+
+        assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
+        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
+            throw std::invalid_argument("The input matrix dimensions do not match.");
+
+        if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols()))
+            throw std::invalid_argument("The input matrix dimensions do not match [this].");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _addElementProductOf<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, a.m_pArray, b.m_pArray, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::ColumnElementMultiplyWith(const GPUMatrix<ElemType>& a)
+    {
+        if (a.IsEmpty() || IsEmpty())
+            throw std::logic_error("ColumnElementMultiplyWith:
Matrix is empty."); + + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) + throw std::invalid_argument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); + + long N=(long)a.GetNumRows(); + long M=(long)GetNumCols(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _columnElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + throw std::logic_error("RowElementMultiplyWith: Matrix is empty."); + + if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) + throw std::invalid_argument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); + + long N = (long)GetNumRows(); + long M = (long)a.GetNumCols(); + int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _rowElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::RowElementDivideBy(const GPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + throw std::logic_error("RowElementDivideBy: Matrix is empty."); + + if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) + throw std::invalid_argument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); + + long N = (long)GetNumRows(); + long M = (long)a.GetNumCols(); + int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _rowElementDivideBy << > >(m_pArray, a.m_pArray, N, M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::ColumnElementDivideBy(const GPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + throw std::logic_error("ColumnElementDivideBy: Matrix is empty."); + + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) + throw std::invalid_argument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows."); + + long N = (long)a.GetNumRows(); + long M = (long)GetNumCols(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _ColumnElementDivideBy<<>>(m_pArray,a.m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::ElementInverse () + { + if (IsEmpty()) + throw std::logic_error("ElementInverse: Matrix is empty."); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _elemInverse<<>>(m_pArray,N); + if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementInverseOf (const GPUMatrix& a) + { + SetValue(a); + return ElementInverse(); + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoid() + { + performInplaceFunction(0); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidOf (const GPUMatrix& a) + { + Resize(a.GetNumRows(),a.GetNumCols()); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignSigmoidOf<<>>(a.m_pArray,m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + /*SetValue(a); + InplaceSigmoid();*/ + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoidDerivative() + { + AssignSigmoidDerivativeOf(*this); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidDerivativeOf (const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AssignSigmoidDerivativeOf: Matrix a is empty."); + + //auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + PrepareDevice(); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + + _assignSigmoidDerivative<<>>(a.m_pArray, m_pArray, N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + + template + GPUMatrix& GPUMatrix::InplaceTanh() + { + performInplaceFunction(1); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTanhOf (const GPUMatrix& a) + { + SetValue(a); + InplaceTanh(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLogSoftmax (const bool isColWise) + { + if (IsEmpty()) + throw std::logic_error("InplaceLogSoftmax: Matrix is empty."); + + PrepareDevice(); + if (isColWise) + { + long N=(long)GetNumCols(); //one kernel per column + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _logSoftMaxColWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else + { + long N=(long)GetNumRows(); //one kernel per column + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _logSoftMaxRowWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLogSoftmaxOf (const GPUMatrix& a, const bool isColWise) + { + Resize(a.GetNumRows(),a.GetNumCols()); + if (isColWise) + { + PrepareDevice(); + long N = (long)GetNumCols(); + long M = (long)GetNumRows(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignColumnwiseLogSoftmaxOf<<>>(a.m_pArray,m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) 
CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else + { + NOT_IMPLEMENTED; + } + + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSqrt() + { + performInplaceFunction(2); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSqrtOf (const GPUMatrix& a) + { + SetValue(a); + InplaceSqrt(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceExp() + { + performInplaceFunction(3); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignExpOf (const GPUMatrix& a) + { + SetValue(a); + InplaceExp(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLog() + { + performInplaceFunction(4); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLogOf (const GPUMatrix& a) + { + SetValue(a); + InplaceLog(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceAbs() + { + performInplaceFunction(5); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAbsOf (const GPUMatrix& a) + { + SetValue(a); + InplaceAbs(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLinearRectifierDerivative() + { + performInplaceFunction(6); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLinearRectifierDerivativeOf (const GPUMatrix& a) + { + SetValue(a); + InplaceLinearRectifierDerivative(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceCosine() + { + performInplaceFunction(7); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignCosineOf (const GPUMatrix& a) + { + SetValue(a); + InplaceCosine(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceNegativeSine() + { + performInplaceFunction(8); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignNegativeSineOf (const GPUMatrix& a) + { + SetValue(a); + InplaceNegativeSine(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateBottom (const ElemType threshold) + { + if (IsEmpty()) + throw std::logic_error("InplaceTruncateBottom: Matrix is empty."); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _inplaceTruncateBottom<<>>(m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateBottomOf (const GPUMatrix& a, const ElemType threshold) + { + if (a.IsEmpty()) + throw std::logic_error("AssignTruncateBottomOf: Matrix a is empty."); + + if (this!=&a) + { + Resize(a.GetNumRows(), a.GetNumCols()); + } + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignTruncateBottom<<>>(m_pArray,a.m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateTop (const ElemType threshold) + { + if (IsEmpty()) + throw std::logic_error("InplaceTruncateTop: Matrix is empty."); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _inplaceTruncateTop<<>>(m_pArray,threshold,N); 
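+        // Editorial note: every kernel launch in this file follows the same
+        // do_sync pattern seen here - create a cudaEvent_t before the launch,
+        // record it on the stream immediately after, block the host on it,
+        // then destroy it. Sketched with a placeholder kernel name:
+        //
+        //     if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        //     _someKernel<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(/*...*/);
+        //     if (do_sync) CUDA_CALL(cudaEventRecord(done));      // mark completion point
+        //     if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); // host waits for the kernel
+        //     if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        //
+        // When NO_SYNC is defined, do_sync is false and launches stay fully
+        // asynchronous on t_stream.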
+ if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateTopOf (const GPUMatrix& a, const ElemType threshold) + { + if (a.IsEmpty()) + throw std::logic_error("AssignTruncateTopOf: Matrix a is empty."); + + if (this!=&a) + { + Resize(a.GetNumRows(), a.GetNumCols()); + } + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignTruncateTop<<>>(m_pArray,a.m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + template + GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + if (IsEmpty()) + throw std::logic_error("SetToZeroIfAbsLessThan: Matrix is empty."); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setToZeroIfAbsLessThan<<>>(m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + ElemType GPUMatrix::SumOfAbsElements() const + { + if (IsEmpty()) + throw std::logic_error("SumOfAbsElements: Matrix is empty"); + + cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); + if (sizeof(ElemType)==sizeof(float)) + { + float res=0; + cublasSasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); + return res; + } + else + { + double res=0; + cublasDasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); + return ElemType(res); + } + } + + template + ElemType GPUMatrix::SumOfElements() const + { + if (IsEmpty()) + throw std::logic_error("SumOfElements: Matrix is empty"); + + PrepareDevice(); + ElemType* d_sum = NULL; + ElemType h_sum; + CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! + _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements()); + CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_sum)); + return h_sum; + } + + + template + GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AssignSumOfElements: Matrix a is empty"); + + Resize(1,1); + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + //WARNING: THIS kernel is not the most efficient way! + _reductionSumAndAssign<<<1,1024>>>(m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return (*this); + } + + template + DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const + { + if (IsEmpty()) + throw std::logic_error("Matrix is empty"); + PrepareDevice(); + ElemType* d_sum = NULL; + CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! 
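+        // Editorial note: the warning above refers to the launch shape - a
+        // single block of 1024 threads strides over all N elements, so only
+        // one multiprocessor does the work. A multi-block tree reduction
+        // (grid-wide partial sums followed by a final single-block pass)
+        // would be faster; the one-block form is kept for simplicity.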
+ _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements()); + DeviceBoundNumber result; + result.ShallowCopyFrom(d_sum,GetComputeDeviceId()); + return result; + } + + template + ElemType GPUMatrix::Max() const + { + cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); + ElemType res; + if (sizeof(ElemType)==sizeof(float)) + { + int resInd=0; + cublasIsamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&resInd); + resInd--; + CUDA_CALL(cudaMemcpy(reinterpret_cast(&res),reinterpret_cast(m_pArray+resInd),sizeof(float),cudaMemcpyDeviceToHost)); + return res; + } + else + { + int resInd=0; + cublasIdamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&resInd); + resInd--; + CUDA_CALL(cudaMemcpy(reinterpret_cast(&res),m_pArray+resInd,sizeof(float),cudaMemcpyDeviceToHost)); + return res; + } + } + + + template + GPUMatrix& GPUMatrix::ElementMultiplyWith (const GPUMatrix& a) + { + if (IsEmpty() || a.IsEmpty()) + throw std::logic_error("ElementMultiplyWith: Matrix is empty."); + + GPUMatrix& us=*this; + assert (us.GetNumRows() == a.GetNumRows() && us.GetNumCols() == a.GetNumCols()); + if (us.GetNumRows() != a.GetNumRows() || us.GetNumCols() != a.GetNumCols()) + throw std::invalid_argument("The matrix dimensions do not match."); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _elemMul<<>>(m_pArray,a.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementProductOf (const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AssignElementProductOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + throw std::invalid_argument("The input matrix dimensions do not match."); + + Resize(a.GetNumRows(), a.GetNumCols()); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignElementProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::ElementDivideBy(const GPUMatrix& a) + { + return AssignElementDivisionOf(*this, a); + } + + template + GPUMatrix& GPUMatrix::AssignElementDivisionOf (const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AssignElementDivisionOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + throw std::invalid_argument("The input matrix dimensions do not match."); + + Resize(a.GetNumRows(), a.GetNumCols()); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignElementDivisionOf<<>>(m_pArray,a.m_pArray,b.m_pArray,N); + if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const + { + return AreEqual(*this, a, threshold); + } + + template + void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorNorm1: Matrix is empty."); + + const long n = (long)GetNumRows(); + const long m = (long)GetNumCols(); + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + cudaEvent_t done = nullptr; + PrepareDevice(); + c.ChangeDeviceTo(GetComputeDeviceId()); + + int blocksPerGrid=0; + if (isColWise) //col-wise + { + c.Resize(1,m); + blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); + } + else + { + c.Resize(n, 1); + c.ChangeDeviceTo(GetComputeDeviceId()); + blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); + } + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _vectorNorm1<<>>(c.m_pArray, m_pArray,n,m,isColWise); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) + { + a.VectorNorm1(*this, isColWise); + return *this; + } + + template + void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorNorm2: Matrix is empty."); + + const long n = (long)GetNumRows(); + const long m = (long)GetNumCols(); + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + cudaEvent_t done = nullptr; + PrepareDevice(); + c.ChangeDeviceTo(GetComputeDeviceId()); + + int blocksPerGrid=0; + if (isColWise) //col-wise + { + c.Resize(1,m); + blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); + } + else + { + c.Resize(n, 1); + c.ChangeDeviceTo(GetComputeDeviceId()); + blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); + } + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _vectorNorm2<<>>(c.m_pArray, m_pArray,n,m,isColWise); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) + { + a.VectorNorm2(*this, isColWise); + return *this; + } + + template + void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorMax: Matrix is empty."); + + //this implementation is not efficient + GPUMatrix tmp; + GPUMatrix tmp1; + tmp.AssignAbsOf((*this)); + tmp.VectorMax(tmp1,c,isColWise); + } + + template + GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) + { + a.VectorNormInf(*this, isColWise); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) + { + InnerProduct (a, b, *this,isColWise); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AssignKhatriRaoProductOf: Matrix is empty."); + + long cols = a.GetNumCols(); + assert (cols == b.GetNumCols()); + if (!(cols == b.GetNumCols())) + throw std::invalid_argument("AssignKhatriRaoProductOf: The input matrix dimensions do not match."); + + long rowsA = 
(long)a.GetNumRows();
+        long rowsB = (long)b.GetNumRows();
+        Resize(rowsA * rowsB, cols);
+        float N=(float)GetNumElements();
+        int blocksPerGrid =(int)ceil(N/threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
+    //column-wise reshaped product. Used to compute the KhatriRaoProduct gradient.
+    // this = reshape each column of a from (K1xK2,1) to (K1, K2)
+    // if each column of a is not transposed, each (K1, K2) block multiplies the matching column of b (K2, frames),
+    // and the output is a (K1, frames) matrix
+    // if each column of a is transposed, each (K1, K2)^T block multiplies the matching column of b (K1, frames), and the output is (K2, frames)
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddColumnReshapeProductOf(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const bool transposeAColumn)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            throw std::logic_error("AddColumnReshapeProductOf: Matrix is empty.");
+
+        long cols = a.GetNumCols();
+        assert (cols == b.GetNumCols());
+        if (!(cols == b.GetNumCols()))
+            throw std::invalid_argument("AddColumnReshapeProductOf: The input matrix dimensions do not match.");
+
+        long rowsA = (long)a.GetNumRows();
+        long rowsB = (long)b.GetNumRows();
+        if (rowsA % rowsB != 0)
+            throw std::invalid_argument("AddColumnReshapeProductOf: the number of rows in a must be a multiple of that in b.");
+
+        long rowsC = rowsA / rowsB;
+        if (rowsC != GetNumRows() || cols != GetNumCols())
+            throw std::invalid_argument("AddColumnReshapeProductOf: This matrix does not have the right size.");
+
+        float N=(float)GetNumElements();
+        int blocksPerGrid =(int)ceil(N/threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddWithScaleOf(ElemType alpha, const GPUMatrix<ElemType>& a)
+    {
+        ScaleAndAdd(alpha, a, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    ElemType GPUMatrix<ElemType>::FrobeniusNorm() const
+    {
+        if (IsEmpty())
+            throw std::logic_error("FrobeniusNorm: Matrix is empty.");
+
+        PrepareDevice();
+        ElemType* d_sum = NULL;
+        ElemType h_sum=0;
+        CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType)));
+        //WARNING: THIS kernel is not the most efficient way!
+        _reductionSum2<ElemType><<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements(), true);
+        CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost));
+        CUDA_CALL(cudaFree(d_sum));
+
+        return (h_sum);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignFrobeniusNormOf (const GPUMatrix<ElemType>& a)
+    {
+        if (a.IsEmpty())
+            throw std::logic_error("AssignFrobeniusNormOf: Matrix a is empty.");
+
+        Resize(1,1);
+
+        PrepareDevice();
+        //WARNING: THIS kernel is not the most efficient way!
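+        // As a rough sketch of what such a single-block reduction does (illustrative only --
+        // the real _reductionSum2 lives in GPUMatrixCUDAKernels.cu, and the trailing 'true'
+        // argument is assumed here to request a final square root):
+        //
+        //   template<class ElemType>
+        //   __global__ void _reductionSum2(const ElemType* data, ElemType* sum, LONG64 N, bool takeSqrt)
+        //   {
+        //       __shared__ ElemType partials[1024];
+        //       ElemType s = 0;
+        //       for (LONG64 i = threadIdx.x; i < N; i += blockDim.x)   // grid-stride load of squares
+        //           s += data[i] * data[i];
+        //       partials[threadIdx.x] = s;
+        //       __syncthreads();
+        //       for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)  // tree reduction; blockDim.x == 1024
+        //       {
+        //           if (threadIdx.x < stride)
+        //               partials[threadIdx.x] += partials[threadIdx.x + stride];
+        //           __syncthreads();
+        //       }
+        //       if (threadIdx.x == 0)
+        //           sum[0] = takeSqrt ? sqrt(partials[0]) : partials[0];
+        //   }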
+ _reductionSum2<<<1,1024,0,t_stream>>>(a.m_pArray,m_pArray,(LONG64)a.GetNumElements(), true); + + return *this; + } + + template + ElemType GPUMatrix::MatrixNormInf() const + { + if (IsEmpty()) + throw std::logic_error("MatrixNorm1: Matrix is empty."); + + PrepareDevice(); + ElemType* d_maxAbs = NULL; + ElemType h_maxAbs=0; + CUDA_CALL(cudaMalloc((void**)&d_maxAbs,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! + _reductionMatrixNormInf<<<1,1024,0,t_stream>>>(m_pArray,d_maxAbs,(LONG64)GetNumElements()); + CUDA_CALL(cudaMemcpy(&h_maxAbs,d_maxAbs,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_maxAbs)); + return h_maxAbs; + } + + template + ElemType GPUMatrix::MatrixNorm1() const + { + if (IsEmpty()) + throw std::logic_error("MatrixNorm1: Matrix is empty."); + return SumOfAbsElements(); + } + + template + ElemType GPUMatrix::MatrixNorm0() const + { + if (IsEmpty()) + throw std::logic_error("MatrixNorm0: Matrix is empty."); + + PrepareDevice(); + ElemType* d_nz = NULL; + ElemType h_nz=0; + CUDA_CALL(cudaMalloc((void**)&d_nz,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! + _reductionMatrixNorm0<<<1,1024,0,t_stream>>>(m_pArray,d_nz,(LONG64)GetNumElements()); + CUDA_CALL(cudaMemcpy(&h_nz,d_nz,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_nz)); + return h_nz; + } + + template + GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AssignSignOf: Matrix a is empty."); + + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + PrepareDevice(); + cudaEvent_t done = nullptr; + int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignSignOf<<>>(m_pArray, a.m_pArray, (long)GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AddSignOf: Matrix a is empty."); + + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + PrepareDevice(); + cudaEvent_t done = nullptr; + int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addSignOf<<>>(m_pArray, a.m_pArray, (LONG64)GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorMax: Matrix is empty."); + + const GPUMatrix& us=*this; + const long m = (long)GetNumRows(); + const long n = (long)GetNumCols(); + assert (m>0 && n>0); //converting from size_t to int may cause overflow + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + if (isColWise) + { + maxValues.Resize(1, n); + maxIndexes.Resize(1, n); + + int blocksPerGrid = n; //we'll have 1 block processing 1 column + _vectorMaxMinReduce<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,true); + + /*int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise);*/ + } + else + { + maxValues.Resize(m, 1); + maxIndexes.Resize(m, 1); + int 
blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); + _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise); + } + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorMax: Matrix is empty."); + + const GPUMatrix& us=*this; + const int m = (int)GetNumRows(); + const int n = (int)GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + if (isColWise) + { + minValues.Resize(1, n); + minIndexes.Resize(1, n); + + int blocksPerGrid = n; //we'll have 1 block processing 1 column + _vectorMaxMinReduce<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,false); + + /* + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise);*/ + } + else + { + minValues.Resize(m, 1); + minIndexes.Resize(m, 1); + int blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); + _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise); + } + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) + throw std::invalid_argument ("AssignNumOfDiff: a and b must have same dimension."); + + Resize(1,1); //result should be one element + + PrepareDevice(); + cudaEvent_t done = nullptr; + //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + //_assignNumOfDiff<<>>(a.m_pArray, b.m_pArray, m_pArray, a.GetNumElements()); + _assignNumOfDiff<<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, (LONG64)a.GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + +#pragma endregion Member BLAS Functions + +#pragma region Other helper functions + template + void GPUMatrix::Print(const char* /*matrixName*/, size_t /*rowStart*/, size_t /*rowEnd*/, size_t /*colStart*/, size_t /*colEnd*/) const + { + NOT_IMPLEMENTED; + } + + template + void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const + { + Print(matrixName, 0, GetNumRows()-1, 0, GetNumCols()-1); + } + + // file I/O + //matrixName is used to verify that correct matrix is read. + template + void GPUMatrix::ReadFromFile(FILE*, const char * /*matrixName*/) + { + NOT_IMPLEMENTED; + } + + //matrixName is used to verify that correct matrix is read. 
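+    // Both ReadFromFile and WriteToFile are currently NOT_IMPLEMENTED. If one were to fill
+    // them in, a plausible host-staged sketch could look like the following (hypothetical --
+    // the helper name, header layout, and use of std::vector are illustrative, not the
+    // project's actual on-disk format):
+    //
+    //   template<class ElemType>
+    //   void WriteMatrixSketch(FILE* f, const ElemType* d_data, size_t rows, size_t cols, const char* matrixName)
+    //   {
+    //       std::vector<ElemType> h(rows * cols);
+    //       CUDA_CALL(cudaMemcpy(h.data(), d_data, h.size() * sizeof(ElemType), cudaMemcpyDeviceToHost));
+    //       fprintf(f, "%s %zu %zu\n", matrixName, rows, cols);  // name written so the reader can verify it
+    //       fwrite(h.data(), sizeof(ElemType), h.size(), f);
+    //   }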
+    template<class ElemType>
+    void GPUMatrix<ElemType>::WriteToFile(FILE*, const char * /*matrixName*/)
+    {
+        NOT_IMPLEMENTED;
+    }
+
+    //helper function used for convolutional neural networks
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignPackedConvolutionInput(const GPUMatrix<ElemType>& inputSubBatch,
+        const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
+        const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
+        const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
+        const bool zeroPadding)
+    {
+        assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth);
+
+        size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
+        size_t packedInputColsPerSample = outputWidth * outputHeight;
+        size_t smallBatchSize = inputSubBatch.GetNumCols();
+        Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
+        if (zeroPadding)
+            SetValue((ElemType)0);
+
+        PrepareDevice();
+        int numThreadPerBlock = threadsPerBlock;
+#if 1
+        int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock;
+#else
+        dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize);
+#endif
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignPackedConvolutionInput<ElemType><<<blocksPerGrid,numThreadPerBlock,0,t_stream>>>(m_pArray,
+            inputSubBatch.m_pArray,
+            smallBatchSize,
+            inputWidth, inputHeight, inputChannels,
+            outputWidth, outputHeight, outputChannels,
+            kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
+    //helper function used for convolutional neural networks
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::UnpackConvolutionInput(GPUMatrix<ElemType>& inputSubBatch,
+        const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
+        const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
+        const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
+        const bool zeroPadding) const
+    {
+        assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth);
+
+        size_t smallBatchSize = inputSubBatch.GetNumCols();
+
+        PrepareDevice();
+        int numThreadPerBlock = threadsPerBlock;
+#if 1
+        int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock;
+#else
+        dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize);
+#endif
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _unpackConvolutionInput<ElemType><<<blocksPerGrid,numThreadPerBlock,0,t_stream>>>(m_pArray,
+            inputSubBatch.m_pArray,
+            smallBatchSize,
+            inputWidth, inputHeight, inputChannels,
+            outputWidth, outputHeight, outputChannels,
+            kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return inputSubBatch;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignMaxPoolingResult(const GPUMatrix<ElemType>& inputBatch, const size_t channels,
+        const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
+        const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
+        const
size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + unsigned int batchSize = inputBatch.GetNumCols(); + Resize(outputSizePerSample, batchSize); + + int numThreadPerBlock = threadsPerBlock; + int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignMaxPoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + unsigned int batchSize = outputGradientBatch.GetNumCols(); + int numThreadPerBlock = threadsPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + + int blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + _addMaxPoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, inputBatch.m_pArray, outputBatch.m_pArray, batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + unsigned int batchSize = inputBatch.GetNumCols(); + Resize(outputSizePerSample, batchSize); + + int numThreadPerBlock = threadsPerBlock; + int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignAveragePoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) 
CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + size_t batchSize = outputGradientBatch.GetNumCols(); + int numThreadPerBlock = threadsPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + + size_t blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + _addAveragePoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, (long)batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + +#pragma endregion Other helper functions + +#pragma region Static BLAS Functions + template + void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, + ElemType beta, GPUMatrix& c) + { + a.PrepareDevice(); + if ((a.GetComputeDeviceId()!=b.GetComputeDeviceId()) || (b.GetComputeDeviceId()!=c.GetComputeDeviceId())) //different GPUs + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + cublasHandle_t cuHandle = GetCublasHandle(b.GetComputeDeviceId()); + cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; + int m = int(transposeA ? a.m_numCols : a.m_numRows); + int n = int(transposeB ? b.m_numRows : b.m_numCols); + int k = int(transposeA ? a.m_numRows : a.m_numCols); + int l = int(transposeB ? 
b.m_numCols : b.m_numRows); + c.Resize(m,n); + + if (!(m>0 && k>0 && l>0 && n>0)) + { + throw std::runtime_error("!(m>0 && k>0 && l>0 && n>0)"); //converting from size_t to int may cause overflow + } + if (k!=l) + { + throw std::runtime_error("matrix dim mismatch in MultiplyAndWeightedAdd"); + } + if (sizeof(ElemType)==sizeof(float)) + { + CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); + } + else if (sizeof(ElemType)==sizeof(double)) + { + CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); + } + else + { + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + c.m_numRows=m; + c.m_numCols=n; + } + } + + template + void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + { + return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 1, c); + } + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + { + return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 0, c); + } + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); + } + + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c + /// if a is a column vector, add to all columns of c + /// if a is a row vector, add to all rows of c + /// if a is a scalar, add to all elements of c + /// Scalar + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, GPUMatrix& c) + { + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + if (a.IsEmpty() || c.IsEmpty()) + throw std::logic_error("ScaleAndAdd: one of the input matrices is empty."); + //if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector + if (a.GetNumRows()==c.GetNumRows() && a.GetNumCols()==c.GetNumCols()) // dimensions match + { + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int len = m * n; + const int incx = 1; + const int incy = 1; + + assert (m>0 && n>0 && len>0); //converting from size_t to int may cause overflow + assert ((int)c.GetNumRows() == m && (int)c.GetNumCols() == n); + if ((int)c.GetNumRows() != m || (int)c.GetNumCols() != n) + throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); + + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + if (sizeof(ElemType) == sizeof(float)) + { + CUBLAS_CALL(cublasSaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); + } + else if (sizeof(ElemType) == sizeof(double)) + { + CUBLAS_CALL(cublasDaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); + } + else + { + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + } + else if 
(a.GetNumElements() == 1) + { + LONG64 N=(LONG64)c.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + c.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _scaleAndAddScalar<<>>(c.m_pArray, N, alpha, a.m_pArray); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else if (a.GetNumCols() == 1) //col vector, add it to all columns + { + long m = (long)c.GetNumRows(); + long n = (long)c.GetNumCols(); + if (m != (long)a.GetNumRows()) + throw std::invalid_argument("To add column vector, rows should match."); + + cudaEvent_t done = nullptr; + int blocksPerGrid = (int)(ceil(1.0*m*n / threadsPerBlock)); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); +#ifdef VALIDATION + printf(">>>> CUDA compute device is %d\n", a.GetComputeDeviceId()); + printf(">>>> a.m_pArray = %p, c.m_pArray = %p, alpha = %f, m = %ld, n = %ld\n", a.m_pArray,c.m_pArray,alpha,m,n); + for (int i=0; i < 2; i++) + { + ElemType buffer[10] = {-1.234f}; + cudaError_t error = cudaMemcpy(buffer, !i?a.m_pArray:c.m_pArray, sizeof(buffer), cudaMemcpyKind::cudaMemcpyDeviceToHost); + if (error == cudaError::cudaSuccess) + printf("buffer valid\n"); + } +#endif + + _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.m_pArray,c.m_pArray,alpha,m,n); + + + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else if (a.GetNumRows()==1) //row vector, add it to all rows + { + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + int m = (int)c.GetNumRows(); + int n = (int)c.GetNumCols(); + assert (n == (int)a.GetNumCols()); + if (n != (int)a.GetNumCols()) + throw std::invalid_argument("To add row vector, cols should match."); + + if (sizeof(ElemType) == sizeof(double)) + { + foreach_row(i,c) + { + CUBLAS_CALL(cublasDaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); + } + } + else + { + foreach_row(i,c) + { + CUBLAS_CALL(cublasSaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); + } + } + } + else + throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); + } + } + + /// c += alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) + { + throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AddScaledDifference: Input matrix a is empty."); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + 
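+            // Presumably _addScaledDifference is a plain one-thread-per-element kernel in
+            // GPUMatrixCUDAKernels.cu along these lines (an illustrative sketch that matches
+            // the argument order of the launch below, not the verified kernel body):
+            //
+            //   template<class ElemType>
+            //   __global__ void _addScaledDifference(ElemType alpha, ElemType* a, ElemType* b, ElemType* c, LONG64 N)
+            //   {
+            //       LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
+            //       if (id >= N)
+            //           return;
+            //       c[id] += alpha * (a[id] - b[id]);   // fused scale-and-subtract, one element per thread
+            //   }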
_addScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + { + throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); + + if (&c != &a && &c != &b) + c.Resize(a.GetNumRows(), a.GetNumCols()); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + /// c += alpha * (a-b) + /// if a, b, c must have same dim + /// 1X1 matrix + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + assert(alpha.GetNumElements() == 1); + if (!(alpha.GetNumElements() == 1)) + throw std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); + + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) + { + throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AddScaledDifference: Input matrix a is empty."); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + assert(alpha.GetNumElements() == 1); + if (!(alpha.GetNumElements() == 1)) + throw 
std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); + + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + { + throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); + + c.Resize(a.GetNumRows(), a.GetNumCols()); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + //c[ci,cj] += a[ai,aj] + template + void GPUMatrix::AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) + { + if (ai >= a.GetNumRows() || aj >=a.GetNumCols() || + ci >= c.GetNumRows() || cj >=c.GetNumCols()) + throw std::invalid_argument("AddElementToElement: index out of range."); + + a.PrepareDevice(); + cudaEvent_t done = nullptr; + int blocksPerGrid=1; //only one element + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addElementToElement<<>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj)); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) + { + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + if (sizeof(ElemType)==sizeof(float)) + { + float alph = (float)alpha; + CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(float*)a.m_pArray,1)); + } + else if (sizeof(ElemType)==sizeof(double)) + { + double alph = alpha; + CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(double*)a.m_pArray,1)); + } + else + { + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + } + + + template + void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) + { + if (alpha.GetNumElements()!=1) + { + throw std::runtime_error("Matrix alpha must be 1x1"); + } + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + if (sizeof(ElemType)==sizeof(float)) + { + CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),(float*)alpha.m_pArray,(float*)a.m_pArray,1)); + } + else if (sizeof(ElemType)==sizeof(double)) + { + CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),(double*)alpha.m_pArray,(double*)a.m_pArray,1)); + } + else + { + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + } + + template //c = alpha * a + void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + { + if (a.IsEmpty()) + throw std::logic_error("Scale: Input matrix a is empty."); + + c=a; + Scale(alpha,c); + } + + + template + void GPUMatrix::InnerProduct (const GPUMatrix& a, const 
GPUMatrix& b, GPUMatrix& c, const bool isColWise) + { + if (a.GetComputeDeviceId()!=b.GetComputeDeviceId() || b.GetComputeDeviceId()!=c.GetComputeDeviceId()) //different GPUs + throw std::invalid_argument("All matrices must be on the same GPU"); + + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("Scale: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + throw std::invalid_argument("Matrices a and b should have same dimension."); + + if (isColWise) + c.Resize(1,n); + else + c.Resize(m,1); + + if ((isColWise && m == 1) || !isColWise && n == 1) //in this case it's equivalent to element-wise product + { + c.AssignElementProductOf(a, b); + } + else + { + cudaEvent_t done = nullptr; + c.PrepareDevice(); + + int blocksPerGrid=0; + if (isColWise) //col-wise + { + c.Resize(1,n); + blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); + } + else + { + c.Resize(m, 1); + blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); + } + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _innerProduct<<>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + template + ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); + + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + if (sizeof(ElemType) == sizeof(double)) + { + double tmp=0; + CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); + return ElemType(tmp); + //return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); + } + else + { + float tmp=0; + CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); + return tmp; + //return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); + } + } + + + template + GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); + + Resize(1,1); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have 
same dimension."); + + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + if (sizeof(ElemType) == sizeof(double)) + { + CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); + } + else + { + CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); + } + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + return *this; + } + + + template + void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + { + if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + if (a.IsEmpty()) + throw std::logic_error("ElementWisePower: The input matrix a is empty."); + if (a.GetNumRows()!=c.GetNumRows() || a.GetNumCols()!=c.GetNumCols()) + throw std::logic_error("ElementWisePower: matrices must be of the same size"); + + cudaEvent_t done = nullptr; + a.PrepareDevice(); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + LONG64 N=(LONG64)a.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + _elementWisePowerOnCuda<<>>(alpha,a.m_pArray,c.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + template + bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AreEqual: one of the input matrices is empty."); + + if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) + return false; + + a.PrepareDevice(); + long *res = new long[1]; + res[0]=1; + long *d_res = NULL; + CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(long)*1)); + CUDA_CALL(cudaMemcpy(d_res,res,sizeof(long)*1,cudaMemcpyHostToDevice)); + long N=(long)a.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + _areEqual<<>>(a.m_pArray,b.m_pArray,N,threshold,d_res); + CUDA_CALL(cudaMemcpy(res,d_res,sizeof(long)*1,cudaMemcpyDeviceToHost)); + if (res[0]!=0) + return true; + else + return false; + } + + template + GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols) + { + GPUMatrix c(rows, cols); //will initialize to 0 + c.SetValue(1); + return c; + } + + template + GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols) + { + GPUMatrix c(rows, cols); //will initialize to 0 + //c.SetValue(0); + return c; + } + + template + GPUMatrix GPUMatrix::Eye(const size_t rows) + { + GPUMatrix c(rows, rows); //will initialize to 0 + c.SetDiagonalValue(1); + return c; + } + + template + GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) + { + GPUMatrix c(rows, cols); //will initialize to 0 + c.SetUniformRandomValue(low, high, seed); + return c; + } + + template + GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) + { + GPUMatrix c(rows, cols); //will initialize to 0 + c.SetGaussianRandomValue(mean, sigma, seed); + return c; + } + + template + ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix &Gradients, const GPUMatrix &SmoothedGradients) + { + Gradients.PrepareDevice(); + ElemType* d_res=NULL; + 
CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(ElemType))); //we allocate memory on the device + + //Compute inner product of matrices and keep it on device + const int m = (int)Gradients.GetNumRows(); + const int n = (int)Gradients.GetNumCols(); + const int k = (int)SmoothedGradients.GetNumRows(); + const int l = (int)SmoothedGradients.GetNumCols(); + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); + + if (sizeof(ElemType) == sizeof(double)) + { + cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + } + else + { + cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + } + // d_res[0] should now contain inner product of matrices + // Compute squared Frobenius norms (squared sums of elements) + _lrHelper<<<1,512,0,t_stream>>>(Gradients.m_pArray,SmoothedGradients.m_pArray, (LONG64)Gradients.GetNumElements(), d_res); + ElemType res; + CUDA_CALL(cudaMemcpy(&res,d_res,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_res)); + return res; + } + +#pragma endregion Static BLAS Functions + + + template class GPUMatrix; + template class GPUMatrix; + template class DeviceBoundNumber; + template class DeviceBoundNumber; + + template + cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus]={0}; + + template + void* GPUMatrix::s_curandGenerator=NULL; +}}} + +// !!!!This is from helper_cuda.h which comes with CUDA samples!!!! 
Consider if it is beneficial to just include all helper_cuda.h +// TODO: This is duplicated in BestGpu.cpp +// Beginning of GPU Architecture definitions +int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { + { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + { -1, -1 } + }; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + return nGpuArchCoresPerSM[7].Cores; +}; +// end of GPU Architecture definitions + +//inline long _GetFreeMemoryOnCUDADevice(int devId) +//{ +// CUdevice cudaDevice; +// CUresult result = cuDeviceGet(&cudaDevice, devId); +// if(result!= CUDA_SUCCESS) +// { +// return 0; +// } +// +// //create cuda context +// CUcontext cudaContext; +// result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice); +// if(result != CUDA_SUCCESS) +// { +// return 0; +// } +// +// //get the amount of free memory on the graphics card +// size_t free; +// size_t total; +// result = cuMemGetInfo(&free, &total); +// if (result!=CUDA_SUCCESS) +// { +// return 0; +// } +// else +// return (long)free; +//} + +#endif // CPUONLY diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index c7e817239..2115bf205 100644 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -1,3266 +1,3405 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - -#include "BestGpu.h" - -#ifndef CPUONLY - -#include -#include -#include "CommonMatrix.h" -#include "device_functions.h" - - -#ifndef LONG64 //we would like to use 64-bit long to support large matrices. However, CUDA seems to support only 32-bit long -#define LONG64 long -#endif - -#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing -#define threadsPerBlock 512 - -// Predefine this for later. 
-static __inline__ __device__ double atomicAdd(double* address, double val); -//CUDA Kernels code -template -__global__ void _elementWisePowerOnCuda( - ElemType alpha, - const ElemType *a, - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (alpha==0) - { - c[id]=1; - } - else if (alpha==1) - { - c[id]=a[id]; - } - else if (alpha==2) - { - c[id]=a[id]*a[id]; - } - else if (alpha==3) - { - c[id]=a[id]*a[id]*a[id]; - } - else - { - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=pow(a[id],alpha); - } - else - { - c[id]=powf(a[id],alpha); - } - } -}; - -template -__global__ void _inplaceSigmoidOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - if (c[id]>=0) - { - double e = exp(-1*c[id]); - c[id]=1/(1+e); - } - else - { - double e = exp(c[id]); - c[id]=e/(1+e); - } - } - else - { - if (c[id]>=0) - { - float e = expf(-1*c[id]); - c[id]=1/(1+e); - } - else - { - float e = exp(c[id]); - c[id]=e/(1+e); - } - } -}; - -template -__global__ void _assignSigmoidOf( - const ElemType* a, - ElemType* res, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - if (a[id]>=0) - { - double e = exp(-1*a[id]); - res[id]=1/(1+e); - } - else - { - double e = exp(a[id]); - res[id]=e/(1+e); - } - } - else - { - if (a[id]>=0) - { - float e = expf(-1*a[id]); - res[id]=1/(1+e); - } - else - { - float e = exp(a[id]); - res[id]=e/(1+e); - } - } -}; - -template -__global__ void _inplaceLinRectDerivative( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (c[id]<=0) - c[id]=0; - else - c[id]=1; -} - -template -__global__ void _assignSigmoidDerivative( - ElemType *a, - ElemType *c, - LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - c[id] = a[id] * (1-a[id]); -} - -template -__global__ void _inplaceTanhOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=tanh(c[id]); - } - else - { - c[id]=tanhf(c[id]); - } - -}; - -//to prevent negative values caused by floating operations, we force inputs to be >=0 -//this may, however, hide problems in the caller. 
-template -__global__ void _inplaceSqrtOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=sqrt(max((ElemType)0, c[id])); - } - else - { - c[id]=sqrtf(max(ElemType(0), c[id])); - } -}; - -template -__global__ void _inplaceExpOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=exp(c[id]); - } - else - { - c[id]=expf(c[id]); - } -}; - -template -__global__ void _inplaceLogOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (c[id] -__global__ void _inplaceAbsOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=fabs(c[id]); - } - else - { - c[id]=fabsf(c[id]); - } -}; - -template -__global__ void _inplaceCosineOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=cos(c[id]); - } - else - { - c[id]=cosf(c[id]); - } -}; - -template -__global__ void _inplaceNegativeSineOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=-sin(c[id]); - } - else - { - c[id]=-sinf(c[id]); - } -}; - - -template -__global__ void _setValue( - ElemType* a, - const ElemType v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]=v; -}; - -template -__global__ void _setValue( - ElemType* a, - const ElemType* d_v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]=d_v[0]; -}; - -template -__global__ void _assignRowSliceValuesOf(ElemType * dest, ElemType * src, const LONG64 N, const long startIndex, const long destRows, const long srcRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - long col = id / destRows; - long row = id - (col * destRows); - - //dest[id] = src[col*srcRows + row + startIndex]; - dest[id] = src[IDX2C(row + startIndex, col, srcRows)]; -} - -template -__global__ void _addToRowSliceValuesOf(ElemType * dest, ElemType * src, const LONG64 N, const long startIndex, const long destRows, const long srcRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - long col = id / srcRows; //src is the full matrix, rowslice is taken from the dest - long row = id - (col * srcRows); - - //dest[col*destRows + row + startIndex] += src[id]; - dest[IDX2C(row + startIndex, col, destRows)] += src[id]; -} - -template -__global__ void _addWithRowSliceValuesOf(ElemType * dest, ElemType * src, const LONG64 N, const long startIndex, const long destRows, const long srcRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id >= N) - return; - - long col = id / destRows; //dest is the full matrix, rowslice is taken from the src - long row = id - (col * destRows); - - dest[id] += src[IDX2C(row + startIndex, col, srcRows)]; -} - -template -__global__ void _assignRepeatOf(ElemType * dest, ElemType * src, const LONG64 N, const long srcRows, const long srcCols, const long destRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id >= N) - return; - - long destCol = id / destRows; - 
long destRow = id - (destCol * destRows); - long srcRow = destRow % srcRows; - long srcCol = destCol % srcCols; - - dest[id] = src[IDX2C(srcRow,srcCol,srcRows)]; -} - -template -__global__ void _assignDifferenceOf1( - ElemType* us, - const ElemType alpha, - const ElemType* a, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - us[id]=alpha-a[id]; -}; - -template -__global__ void _assignDifferenceOf2( - ElemType* us, - const ElemType alpha, - const ElemType* a, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - us[id]=a[id]-alpha; -}; - -///a is a scalar -template -__global__ void _scaleAndAddScalar( - ElemType* c, - const LONG64 N, - const ElemType alpha, - const ElemType* a -) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - c[id] += alpha*a[0]; -}; - -template -__global__ void _addValue( - ElemType* a, - const ElemType v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]+=v; -}; - -template -__global__ void _addValue( - ElemType* a, - const ElemType* d_v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]+=d_v[0]; -}; - - -template -__global__ void _elemMul( - ElemType* a, - const ElemType* b, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]*=b[id]; -}; - -template -__global__ void _assignElementProductOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - us[id]=a[id]*b[id]; -} - -template -__global__ void _assignKhatriRaoProductOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const long rowsA, - const long rowsB, - const long cols) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - - const long rows = rowsA * rowsB; - const long col = id / rows; - if (col >= cols) - return; - - const long row = id % rows; - const long rowB = row / rowsA; - const long rowA = row % rowsA; - - us[id] = a[rowA + col * rowsA] * b[rowB + col * rowsB]; -} - -template -__global__ void _addColumnReshapeProductOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const long rowsB, - const long rowsC, - const long cols, - const bool transposeAColumn) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - - const long col = id / rowsC; - if (col >= cols) - return; - - const long row = id % rowsC; - long bBase = col * rowsB; - long aBase = bBase * rowsC; - ElemType v = 0; - - if (transposeAColumn) - { - aBase += row * rowsB; - for (long i=0; i -__global__ void _assignElementDivisionOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const LONG64 N) -{ - ElemType smallValue = EPS_IN_INVERSE; - - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - ElemType v = b[id]; - - if (v <0 && v > -smallValue) - us[id] = a[id]/(-smallValue); - else if (v >=0 && v < smallValue) - us[id] = a[id]/smallValue; - else - us[id]=a[id]/v; -} - -template -__global__ void _elemInverse( - ElemType* us, - const LONG64 N) -{ - ElemType smallValue = EPS_IN_INVERSE; - - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - if (us[id] <0 && us[id] > -smallValue) - us[id] = 1/-smallValue; - else if (us[id] >=0 && us[id] < smallValue) - us[id] = 1/smallValue; - else - us[id]=1/us[id]; -} - -template -__global__ void _logSoftMaxColWise( - ElemType *a, - const 
long m_numCols, - const long m_numRows) //ld -{ - int col_id = blockDim.x * blockIdx.x + threadIdx.x; - if (col_id>=m_numCols) - return; - - __shared__ ElemType maxV[threadsPerBlock]; - __shared__ ElemType Sum[threadsPerBlock]; - maxV[threadIdx.x]=a[IDX2C(0,col_id,m_numRows)]; - Sum[threadIdx.x]=0; - - for (long i=0;imaxV[threadIdx.x]) - { - maxV[threadIdx.x]=a[IDX2C(i,col_id,m_numRows)]; - } - } - - for (long i=0;i -//__global__ void _assignColumnwiseSoftmaxOf( -// const ElemType *a, -// ElemType* us, -// const long m_numCols, -// const long m_numRows) //thead per column -//{ -// int col_id = blockDim.x * blockIdx.x + threadIdx.x; -// if (col_id>=m_numCols) -// return; -// -// __shared__ ElemType maxV[threadsPerBlock]; -// __shared__ ElemType Sum[threadsPerBlock]; -// maxV[threadIdx.x]=a[IDX2C(0,col_id,m_numRows)]; -// Sum[threadIdx.x]=0; -// -// for (long i=0;imaxV[threadIdx.x]) -// { -// maxV[threadIdx.x]=a[IDX2C(i,col_id,m_numRows)]; -// } -// } -// -// for (long i=0;i -__global__ void _assignColumnwiseLogSoftmaxOf( - const ElemType *a, - ElemType* us, - const long m_numCols, - const long m_numRows) // each block processes one column. There must be 512 threads in a block -{ - //we first find max per column - __shared__ ElemType colMax[1]; - __shared__ ElemType partials[512]; - colMax[0]=-10000000; - partials[threadIdx.x]=-10000000; - - //int id = blockDim.x * blockIdx.x + threadIdx.x; - int loadPerThread = m_numRows/blockDim.x; - - for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i) - { - partials[threadIdx.x]=max(partials[threadIdx.x],a[IDX2C(i,blockIdx.x,m_numRows)]); - } - __syncthreads(); - - //256 - if (threadIdx.x<256) - { - partials[threadIdx.x]=max(partials[threadIdx.x+256],partials[threadIdx.x]); - } - __syncthreads(); - - //128 - if (threadIdx.x<128) - { - partials[threadIdx.x]=max(partials[threadIdx.x+128],partials[threadIdx.x]); - } - __syncthreads(); - - //64 - if (threadIdx.x<64) - { - partials[threadIdx.x]=max(partials[threadIdx.x+64],partials[threadIdx.x]); - } - __syncthreads(); - - //32 - if (threadIdx.x<32) - { - partials[threadIdx.x]=max(partials[threadIdx.x+32],partials[threadIdx.x]); - } - __syncthreads(); - - //16 - if (threadIdx.x<16) - { - partials[threadIdx.x]=max(partials[threadIdx.x+16],partials[threadIdx.x]); - } - __syncthreads(); - - //8 - if (threadIdx.x<8) - { - partials[threadIdx.x]=max(partials[threadIdx.x+8],partials[threadIdx.x]); - } - __syncthreads(); - - //4 - if (threadIdx.x<4) - { - partials[threadIdx.x]=max(partials[threadIdx.x+4],partials[threadIdx.x]); - } - __syncthreads(); - - if (threadIdx.x==0) - { - colMax[0] = max(max(partials[0],partials[1]),max(partials[2],partials[3])); - } - partials[threadIdx.x]=0.0f; - __syncthreads(); - //end of finding max - //now start finding sums - __shared__ ElemType colSum[1]; - colSum[0]=0.0f; - for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? 
-template<class ElemType>
-__global__ void _assignColumnwiseLogSoftmaxOf(
-    const ElemType *a,
-    ElemType* us,
-    const long m_numCols,
-    const long m_numRows) // each block processes one column. There must be 512 threads in a block
-{
-    //we first find max per column
-    __shared__ ElemType colMax[1];
-    __shared__ ElemType partials[512];
-    colMax[0]=-10000000;
-    partials[threadIdx.x]=-10000000;
-
-    //int id = blockDim.x * blockIdx.x + threadIdx.x;
-    int loadPerThread = m_numRows/blockDim.x;
-
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x],a[IDX2C(i,blockIdx.x,m_numRows)]);
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+256],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+128],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+64],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+32],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+16],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+8],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+4],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        colMax[0] = max(max(partials[0],partials[1]),max(partials[2],partials[3]));
-    }
-    partials[threadIdx.x]=0.0f;
-    __syncthreads();
-    //end of finding max
-    //now start finding sums
-    __shared__ ElemType colSum[1];
-    colSum[0]=0.0f;
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        ElemType tmp=a[IDX2C(i,blockIdx.x,m_numRows)]-colMax[0];
-        us[IDX2C(i,blockIdx.x,m_numRows)]=tmp;
-        partials[threadIdx.x]+=(sizeof(ElemType)==sizeof(float)?expf(tmp):exp(tmp));
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+256];
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+128];
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+64];
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+32];
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+16];
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+8];
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+4];
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        colSum[0] = partials[0]+partials[1]+partials[2]+partials[3];
-        colSum[0] = (sizeof(ElemType)==sizeof(float)?logf(colSum[0]):log(colSum[0]));
-    }
-    __syncthreads();
-    //end of finding sums
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        us[IDX2C(i,blockIdx.x,m_numRows)]-=colSum[0];
-    }
-}
-
-template<class ElemType>
-__global__ void _logSoftMaxRowWise(
-    ElemType *a,
-    const long m_numCols,
-    const long m_numRows) //ld
-{
-    int row_id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (row_id>=m_numRows)
-        return;
-
-    __shared__ ElemType maxV[threadsPerBlock];
-    __shared__ ElemType Sum[threadsPerBlock];
-    maxV[threadIdx.x]=a[IDX2C(row_id,0,m_numRows)];
-    Sum[threadIdx.x]=0;
-
-    for (long j=0;j<m_numCols;++j)
-    {
-        if (a[IDX2C(row_id,j,m_numRows)]>maxV[threadIdx.x])
-        {
-            maxV[threadIdx.x]=a[IDX2C(row_id,j,m_numRows)];
-        }
-    }
-
-    for (long j=0;j<m_numCols;++j)
-    {
-        ElemType tmp=a[IDX2C(row_id,j,m_numRows)]-maxV[threadIdx.x];
-        Sum[threadIdx.x]+=(sizeof(ElemType)==sizeof(float)?expf(tmp):exp(tmp));
-    }
-    Sum[threadIdx.x]=maxV[threadIdx.x]+(sizeof(ElemType)==sizeof(float)?logf(Sum[threadIdx.x]):log(Sum[threadIdx.x]));
-    for (long j=0;j<m_numCols;++j)
-    {
-        a[IDX2C(row_id,j,m_numRows)]-=Sum[threadIdx.x];
-    }
-}
-
-template<class ElemType>
-__global__ void _inplaceTruncateBottom(
-    ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]<threshold)
-        a[id]=threshold;
-}
-
-template<class ElemType>
-__global__ void _assignTruncateBottom(
-    ElemType* us,
-    const ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]<threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
-}
-
-template<class ElemType>
-__global__ void _inplaceTruncateTop(
-    ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]>threshold)
-        a[id]=threshold;
-}
-
-template<class ElemType>
-__global__ void _assignTruncateTop(
-    ElemType* us,
-    const ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]>threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
-}
-
-template<class ElemType>
-__global__ void _setToZeroIfAbsLessThan(
-    ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (sizeof(ElemType)==sizeof(float))
-    {
-        if (fabsf(a[id])<threshold)
-            a[id]=0;
-    }
-    else
-    {
-        if (fabs(a[id])<threshold)
-            a[id]=0;
-    }
-}
-
-template<class ElemType>
-__global__ void _areEqual(
-    const ElemType* a,
-    const ElemType* b,
-    const LONG64 N,
-    const ElemType threshold,
-    long *d_res)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-
-    if (sizeof(ElemType)==sizeof(float))
-    {
-        if (fabsf(a[id]-b[id]) > threshold)
-        {
-            d_res[0]=0;
-        }
-    }
-    else
-    {
-        if (fabs(1.0*a[id]-1.0*b[id]) > threshold)
-        {
-            d_res[0]=0;
-        }
-    }
-
-}
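// Background sketch (illustrative, not part of this file): the unrolled
// 256/128/.../4 steps in _assignColumnwiseLogSoftmaxOf are a shared-memory tree
// reduction across the block's 512 threads, performed once with max() and once
// with +=. The same idea in loop form, assuming blockDim.x is a power of two:
//
//   for (int stride = blockDim.x / 2; stride >= 4; stride >>= 1)
//   {
//       if (threadIdx.x < stride)
//           partials[threadIdx.x] += partials[threadIdx.x + stride];
//       __syncthreads();
//   }
//   // thread 0 then folds the last four partials, as the kernels above do.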
-template<class ElemType>
-__global__ void _setDiagonalValue(
-    ElemType* a,
-    const ElemType v,
-    const unsigned long N,
-    const unsigned long ld)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[IDX2C(id,id,ld)]=v;
-
-}
-
-template<class ElemType>
-__global__ void _setDiagonalValueFromVector(
-    ElemType* a,
-    const ElemType* b,
-    const long N)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[IDX2C(id,id,N)]=b[id];
-}
-
-template<class ElemType>
-__global__ void _adagrad(
-    ElemType* a,
-    ElemType* d_v,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= N)
-        return;
-
-    const ElemType floor = 1e-16f;
-
-    a[id] += d_v[id] * d_v[id];
-    d_v[id] /= sqrt(a[id]+floor);
-}
-
-template<class ElemType>
-__global__ void _rmsprop_init(
-    ElemType* avars, ElemType* signs, ElemType* steps,
-    ElemType* curr_grad,
-    const LONG64 N
-    )
-{
-    LONG64 i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= N)
-        return;
-
-    ElemType tmp = curr_grad[i];
-    avars[i] = tmp * tmp;
-    signs[i] = ElemType(0.0);
-    steps[i] = ElemType(0.02);
-}
-
-template<class ElemType>
-__global__ void _rmsprop(
-    ElemType* avars, ElemType* signs, ElemType* steps,
-    ElemType* curr_grad,
-    const LONG64 N,
-    ElemType RMS_GAMMA,ElemType RMS_WGT_INC,ElemType RMS_WGT_MAX,ElemType RMS_WGT_DEC,ElemType RMS_WGT_MIN,
-    ElemType floor,
-    ElemType *upd_gpu
-    )
-{
-    LONG64 i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= N)
-        return;
-
-    avars[i] = RMS_GAMMA * avars[i] + (ElemType(1.0)-RMS_GAMMA)* (curr_grad[i] * curr_grad[i]);
-
-    //// grad sign base 3: 0->neg, 1->zero, 2->pos
-    //const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
-
-    //// signs[i] contains three consecutive grad_sign
-    //signs[i] = 3*(int(signs[i]) % 9) + grad_sign;
-
-    //// update according to the following table:
-    //// (!pos,!pos,!pos) or (!neg,!neg,!neg): RMS_WGT_INC
-    //// (!neg,!neg,neg) or (!pos,!pos,pos): RMS_WGT_DEC
-    //// otherwise: no action
-
-    //switch(int(upd_gpu[int(signs[i])]))
-    //{
-    //case 0:
-    //    steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
-    //    break;
-    //case 2:
-    //    steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
-    //    break;
-    //}
-    //curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
-
-    const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
-
-    if( signs[i] * grad_sign > 0 )
-        steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
-    else
-        steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
-
-    curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
-    signs[i] = grad_sign;
-
-}
-
-template<class ElemType>
-__global__ void _rescaleToRange(
-    ElemType* a,
-    const LONG64 N,
-    const ElemType low,
-    const ElemType high)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[id]=a[id]*(high-low)+low;
-}
-
-template<class ElemType>
-__global__ void _setMaskAndScale(
-    ElemType* a,
-    const LONG64 N,
-    const ElemType maskRate,
-    const ElemType scaleValue)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[id] = a[id]<=maskRate ? 0 : scaleValue;
-}
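// Background note (illustrative, not part of this file): _adagrad is the usual
// per-weight schedule -- accumulate the squared gradient, then normalize:
//
//   a[i] += g[i]*g[i];
//   g[i] /= sqrt(a[i] + 1e-16);   // the floor guards against division by zero
//
// _rmsprop keeps a leaky average avars = gamma*avars + (1-gamma)*g*g instead,
// and adapts the per-weight step: while the gradient sign stays stable the step
// is multiplied by RMS_WGT_INC, and on a sign flip by RMS_WGT_DEC, clamped to
// [RMS_WGT_MIN, RMS_WGT_MAX].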
-
-template<class ElemType>
-__global__ void _vectorNorm1(
-    ElemType* c, //output
-    const ElemType* a, //input
-    const long n, //a.numRows
-    const long m, //a.numCols
-    const bool isColWise)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=m)||(!isColWise && id>=n))
-        return;
-
-    ElemType sum = 0;
-
-    if (isColWise)
-    {
-        for (long i=0;i<n;++i)
-        {
-            sum += (sizeof(ElemType)==sizeof(float)?fabsf(a[IDX2C(i,id,n)]):fabs(a[IDX2C(i,id,n)]));
-        }
-    }
-    else
-    {
-        for (long j=0;j<m;++j)
-        {
-            sum += (sizeof(ElemType)==sizeof(float)?fabsf(a[IDX2C(id,j,n)]):fabs(a[IDX2C(id,j,n)]));
-        }
-    }
-    c[id]=sum;
-}
-
-template<class ElemType>
-__global__ void _vectorNorm2(
-    ElemType* c, //output
-    const ElemType* a, //input
-    const long N, //a.GetNumRows();
-    const long M, //a.GetNumCols();
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=M) || (!isColWise && id>=N))
-        return;
-
-    ElemType sum = 0;
-    if (isColWise)
-    {
-        for (long i=0;i<N;++i)
-        {
-            ElemType v = a[IDX2C(i,id,N)];
-            sum += v * v;
-        }
-    }
-    else
-    {
-        for (long j=0;j<M;++j)
-        {
-            ElemType v = a[IDX2C(id,j,N)];
-            sum += v * v;
-        }
-    }
-    c[id] = (sizeof(ElemType)==sizeof(float)?sqrtf(sum):sqrt(sum));
-}
-
-template<class ElemType>
-__global__ void _convertInd2ValsAdjustInd(
-    ElemType* inds,
-    const ElemType* M,
-    ElemType* vals,
-    const long n, //number of cols
-    const long m, //number of rows
-    const bool isColWise)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=n)||(!isColWise && id>=m))
-        return;
-    inds[id]--;
-    if (isColWise)
-    {
-        vals[id]=M[IDX2C((int)inds[id],id,m)];
-    }
-    else
-    {
-        vals[id]=M[IDX2C(id,(int)inds[id],m)];
-    }
-}
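// Background note (illustrative, not part of this file): these kernels address
// matrices through the column-major macro
//
//   IDX2C(i, j, ld) == ((i) + (j) * (ld))   // row i, column j, leading dimension ld
//
// which is why _vectorNorm1/_vectorNorm2 walk a column by stepping the row
// index with the column fixed, and walk a row by stepping the column index
// with the row fixed.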
-
-
-//assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11)
-template<class ElemType>
-__global__ void _assignPackedConvolutionInput(ElemType * packedMatrix, const ElemType * inputSubBatch, const long batchSize,
-                                              const long inputWidth, const long inputHeight, const long inputChannels,
-                                              const long outputWidth, const long outputHeight, const long outputChannels,
-                                              const long kernelWidth, const long kernelHeight, const long horizontalSubsample, const long verticalSubsample, const bool zeroPadding)
-{
-    const long inputHeightTimesChannel = inputHeight * inputChannels;
-    const size_t inputDim = inputWidth*inputHeightTimesChannel;
-
-    const long idall = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = idall / inputDim;
-    if (sample >= batchSize)
-        return;
-
-    const long id = idall % inputDim;
-    const long y = id / inputHeightTimesChannel; //inputCol
-
-    const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
-    const size_t packedInputColsPerSample = outputWidth * outputHeight; //output size per channel
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels)
-    // IN_ELEM_COLPOS = sample
-
-    const long nXC = id % inputHeightTimesChannel; //channel + inputRow*inputChannels
-    const long x = nXC / inputChannels; //inputRow
-    const long c = nXC % inputChannels; //channel
-
-    ElemType currentInputValue = inputSubBatch[id + sample*inputDim];
-
-    long x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    if (zeroPadding)
-    {
-        const long halfKernelWidth = kernelWidth/2;
-        const long halfKernelHeight = kernelHeight/2;
-
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1.0f+halfKernelHeight)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x+halfKernelHeight-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1.0f+halfKernelWidth)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y+halfKernelWidth-y0*horizontalSubsample; //first posyInKernel
-    }
-    else
-    {
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y-y0*horizontalSubsample; //first posyInKernel
-    }
-
-    // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight)
-    // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow)
-
-    long packColBase = sample*packedInputColsPerSample + y0*outputHeight;
-    for (long wcol = y0, posyInKernel = y1; wcol < outputWidth && posyInKernel>=0; wcol++, posyInKernel -= horizontalSubsample)
-    {
-        long packRowBase = c * kernelWidth * kernelHeight + posyInKernel * kernelHeight;
-        for (long wrow = x0, posxInKernel = x1; wrow < outputHeight && posxInKernel>=0; wrow++, posxInKernel -= verticalSubsample)
-        {
-            const long packRow = packRowBase + posxInKernel;
-            const long packCol = packColBase + wrow;
-            packedMatrix[packRow + packCol*packedInputRows] = currentInputValue;
-        }
-        packColBase += outputHeight;
-    }
-}
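// Background note (illustrative, not part of this file):
// _assignPackedConvolutionInput is an im2col-style pack: each
// kernelWidth x kernelHeight x inputChannels window becomes one column of
// packedMatrix, so the convolution itself reduces to a dense matrix product.
// The resulting shape, in the kernel's own terms:
//
//   packedInputRows          = kernelWidth * kernelHeight * inputChannels;
//   packedInputColsPerSample = outputWidth * outputHeight;
//   // conv output = filterMatrix(outputChannels x packedInputRows) * packedMatrix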
-
-//assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11)
-template<class ElemType>
-__global__ void _unpackConvolutionInput(const ElemType * packedMatrix, ElemType * inputSubBatch, const long batchSize,
-                                        const long inputWidth, const long inputHeight, const long inputChannels,
-                                        const long outputWidth, const long outputHeight, const long outputChannels,
-                                        const long kernelWidth, const long kernelHeight, const long horizontalSubsample, const long verticalSubsample, const bool zeroPadding)
-{
-    const long inputHeightTimesChannel = inputHeight * inputChannels;
-    const size_t inputDim = inputWidth*inputHeightTimesChannel;
-
-    const long idall = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = idall / inputDim;
-    if (sample >= batchSize)
-        return;
-
-    const long id = idall % inputDim;
-    const long y = id / inputHeightTimesChannel; //inputCol
-
-    const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
-    const size_t packedInputColsPerSample = outputWidth * outputHeight; //output size per channel
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels)
-    // IN_ELEM_COLPOS = sample
-
-    const long nXC = id % inputHeightTimesChannel; //channel + inputRow*inputChannels
-    const long x = nXC / inputChannels; //inputRow
-    const long c = nXC % inputChannels; //channel
-
-    long x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    if (zeroPadding)
-    {
-        const long halfKernelWidth = kernelWidth/2;
-        const long halfKernelHeight = kernelHeight/2;
-
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1.0f+halfKernelHeight)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x+halfKernelHeight-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1.0f+halfKernelWidth)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y+halfKernelWidth-y0*horizontalSubsample; //first posyInKernel
-    }
-    else
-    {
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y-y0*horizontalSubsample; //first posyInKernel
-    }
-
-    // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight)
-    // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow)
-
-    ElemType currentInputValue = inputSubBatch[id + sample*inputDim];
-    long packColBase = sample*packedInputColsPerSample + y0*outputHeight;
-    for (long wcol = y0, posyInKernel = y1; wcol < outputWidth && posyInKernel>=0; wcol++, posyInKernel -= horizontalSubsample)
-    {
-        long packRowBase = c * kernelWidth * kernelHeight + posyInKernel * kernelHeight;
-        for (long wrow = x0, posxInKernel = x1; wrow < outputHeight && posxInKernel>=0; wrow++, posxInKernel -= verticalSubsample)
-        {
-            const long packRow = packRowBase + posxInKernel;
-            const long packCol = packColBase + wrow;
-            currentInputValue += packedMatrix[packRow + packCol*packedInputRows];
-        }
-        packColBase += outputHeight;
-    }
-
-    inputSubBatch[id + sample*inputDim] = currentInputValue;
-}
-
-template<class ElemType>
-__global__ void _assignMaxPoolingResult(ElemType * outputBatch, const ElemType * inputBatch, const long batchSize, const long channels,
-                                        const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                        const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                        const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = outputIndex / outputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long outputIndexWithinSample = outputIndex % outputSizePerSample;
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = outputIndexWithinSample / outputHeightTimesChannel; //wcol
-    const long nXC = outputIndexWithinSample % outputHeightTimesChannel; //channel + wrow*channels
-    const long x = nXC / channels; //wrow
-    const long c = nXC % channels; //channel
-
-    const ElemType *inputBatchBase4Sample = inputBatch + sample*inputSizePerSample;
-    register ElemType maxVal = -FLT_MAX;
-    const long rowInWindowBase = (x*verticalSubsample + y*horizontalSubsample*inputHeight)*channels+c;
-    for (long colInWindow=0; colInWindow<windowWidth; colInWindow++)
-    {
-        long rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
-        for (long rowInWindow=0; rowInWindow<windowHeight; rowInWindow++)
-        {
-            const ElemType val = inputBatchBase4Sample[rowInInput];
-            maxVal = max(maxVal, val);
-            rowInInput += channels; //consecutive rows of a channel are `channels` apart
-        }
-    }
-
-    outputBatch[outputIndexWithinSample + sample*outputSizePerSample] = maxVal;
-}
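// Background note (illustrative, not part of this file): _assignMaxPoolingResult
// assigns one thread per output element and walks its pooling window column by
// column, stepping `channels` per row because samples are laid out
// [channel, row, col]. For the unpadded windows used here the output extents
// follow the usual relation:
//
//   outputHeight = (inputHeight - windowHeight) / verticalSubsample   + 1;
//   outputWidth  = (inputWidth  - windowWidth)  / horizontalSubsample + 1;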
-
-template<class ElemType>
-__global__ void _addMaxPoolingGradient(ElemType * inputGradientBatch, const ElemType * outputGradientBatch, const ElemType * inputBatch, const ElemType * outputBatch,
-                                       const long batchSize, const long channels,
-                                       const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                       const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                       const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long inputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = inputIndex / inputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long inputIndexWithinSample = inputIndex % inputSizePerSample;
-
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = inputIndexWithinSample / inputHeightTimesChannel; //col in input
-    const long nXC = inputIndexWithinSample % inputHeightTimesChannel; //channel + row*channels
-    const long x = nXC / channels; //row in input
-    const long c = nXC % channels; //channel
-
-    long startOutX = max(0.0f, ceil((x-(ElemType)windowHeight+1)/ (ElemType)verticalSubsample)); //inclusive start
-    long endOutX = (x/verticalSubsample < outputHeight-1)? x/verticalSubsample : outputHeight-1; //inclusive end
-    long startOutY = max(0.0f, ceil((y-(ElemType)windowWidth+1)/(ElemType)horizontalSubsample)); //inclusive start
-    long endOutY = (y/horizontalSubsample < outputWidth-1)? y/horizontalSubsample : outputWidth-1; //inclusive end
-
-
-    ElemType *inputGradientBatchBase4Sample = inputGradientBatch + sample*inputSizePerSample;
-    const ElemType *outputGradientBatchBase4Sample = outputGradientBatch + sample*outputSizePerSample;
-    const ElemType * outputBatchBase4Sample = outputBatch + sample*outputSizePerSample;
-
-    ElemType inputValue = inputBatch[inputIndexWithinSample + sample*inputSizePerSample];
-    for (long outY=startOutY; outY<=endOutY; outY++)
-    {
-        for (long outX=startOutX; outX<=endOutX; outX++)
-        {
-            long outputIndex = outY * outputHeightTimesChannel + outX * channels + c;
-            if (inputValue == outputBatchBase4Sample[outputIndex])
-                inputGradientBatchBase4Sample[inputIndexWithinSample] += outputGradientBatchBase4Sample[outputIndex];
-        }
-    }
-}
-
-template<class ElemType>
-__global__ void _assignAveragePoolingResult(ElemType * outputBatch, const ElemType * inputBatch, const long batchSize, const long channels,
-                                            const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                            const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                            const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = outputIndex / outputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long outputIndexWithinSample = outputIndex % outputSizePerSample;
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = outputIndexWithinSample / outputHeightTimesChannel; //wcol
-    const long nXC = outputIndexWithinSample % outputHeightTimesChannel; //channel + wrow*channels
-    const long x = nXC / channels; //wrow
-    const long c = nXC % channels; //channel
-
-    const ElemType *inputBatchBase4Sample = inputBatch + sample*inputSizePerSample;
-
-    register ElemType average = 0;
-    const long rowInWindowBase = (x*verticalSubsample + y*horizontalSubsample*inputHeight)*channels+c;
-    for (long colInWindow=0; colInWindow<windowWidth; colInWindow++)
-    {
-        long rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
-        for (long rowInWindow=0; rowInWindow<windowHeight; rowInWindow++)
-        {
-            average += inputBatchBase4Sample[rowInInput];
-            rowInInput += channels;
-        }
-    }
-
-    outputBatch[outputIndexWithinSample + sample*outputSizePerSample] = average/(windowWidth * windowHeight);
-}
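// Background note (illustrative, not part of this file): _addMaxPoolingGradient
// routes each output gradient back to the input position(s) that produced the
// max by re-comparing the saved input value with the pooled output:
//
//   if (inputValue == outputBatchBase4Sample[outputIndex])
//       inputGradient += outputGradient;   // ties are credited more than once
//
// Re-deriving the argmax this way avoids storing an index map, at the cost of a
// second pass over every window that covers the input element.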
-
-template<class ElemType>
-__global__ void _addAveragePoolingGradient(ElemType * inputGradientBatch, const ElemType * outputGradientBatch,
-                                           const long batchSize, const long channels,
-                                           const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                           const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                           const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long inputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = inputIndex / inputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long inputIndexWithinSample = inputIndex % inputSizePerSample;
-
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-    const long windowSize = windowWidth * windowHeight;
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = inputIndexWithinSample / inputHeightTimesChannel; //col in input
-    const long nXC = inputIndexWithinSample % inputHeightTimesChannel; //channel + row*channels
-    const long x = nXC / channels; //row in input
-    const long c = nXC % channels; //channel
-
-    long startOutX = max(0.0f, ceil((x-(ElemType)windowHeight+1)/ (ElemType)verticalSubsample)); //inclusive start
-    long endOutX = (x/verticalSubsample < outputHeight-1)? x/verticalSubsample : outputHeight-1; //inclusive end
-    long startOutY = max(0.0f, ceil((y-(ElemType)windowWidth+1)/(ElemType)horizontalSubsample)); //inclusive start
-    long endOutY = (y/horizontalSubsample < outputWidth-1)? y/horizontalSubsample : outputWidth-1; //inclusive end
-
-    ElemType *inputGradientBatchBase4Sample = inputGradientBatch + sample*inputSizePerSample;
-    const ElemType *outputGradientBatchBase4Sample = outputGradientBatch + sample*outputSizePerSample;
-
-    for (long outY=startOutY; outY<=endOutY; outY++)
-    {
-        for (long outX=startOutX; outX<=endOutX; outX++)
-        {
-            long outputIndex = outY * outputHeightTimesChannel + outX * channels + c;
-            inputGradientBatchBase4Sample[inputIndexWithinSample] += outputGradientBatchBase4Sample[outputIndex]/windowSize;
-        }
-    }
-}
-
-template<class ElemType>
-__global__ void _addMaxPoolingGradientLoopOut(ElemType * inputGradientBatch, const ElemType * outputGradientBatch, const ElemType * inputBatch, const ElemType * outputBatch,
-                                              const long batchSize, const long channels,
-                                              const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                              const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                              const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = outputIndex / outputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long outputIndexWithinSample = outputIndex % outputSizePerSample;
-    const long inputWidthTimesChannel = inputWidth * channels;
-    const long outputWidthTimesChannel = outputWidth * channels;
-    const long y = outputIndexWithinSample / outputWidthTimesChannel;
-    const long nXC = outputIndexWithinSample % outputWidthTimesChannel;
-    const long x = nXC / channels;
-    const long c = nXC % channels;
-
-    const long offset0 = sample*inputSizePerSample + y*verticalSubsample*inputWidthTimesChannel + x*horizontalSubsample*channels;
-    const ElemType *pCurWindow4Input = inputBatch + offset0; // pooling to current window's first input pixel
-    ElemType *pCurWindow4InGradient = inputGradientBatch + offset0;
-    const ElemType maxVal = outputBatch[outputIndex];
-    const ElemType outGrad = outputGradientBatch[outputIndex];
-    for (long yy=0; yy<windowHeight; yy++)
-    {
-        const long rowOffset = yy*inputWidthTimesChannel;
-        for (long xx=0; xx<windowWidth; xx++)
-        {
-            const long posInWindow = rowOffset + xx*channels;
-            if (pCurWindow4Input[posInWindow] == maxVal) //this input produced the max
-                pCurWindow4InGradient[posInWindow] += outGrad;
-        }
-    }
-}
-
-template<class ElemType>
-__global__ void _addElementProductOf(
-    ElemType* us,
-    const ElemType* a,
-    const ElemType* b,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    us[id]+=(a[id]*b[id]);
-}
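// Background note (illustrative, not part of this file):
// _addAveragePoolingGradient is the linear counterpart: every input inside a
// window contributed 1/windowSize to that window's average, so each input just
// accumulates outputGradient/windowSize from every window covering it -- no
// value comparison is needed.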
-
-template<class ElemType>
-__global__ void _columnElementMultiplyWith(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //a.GetNumRows();
-    const long M) //us.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType mul=a[id];
-    for (long j=0;j<M;++j)
-    {
-        us[IDX2C(id,j,N)]=us[IDX2C(id,j,N)]*mul;
-    }
-}
-
-template<class ElemType>
-__global__ void _rowElementMultiplyWith(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //us.GetNumRows();
-    const long M) //a.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=M)
-        return;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType mul=a[id];
-    for (long i=0;i<N;++i)
-    {
-        us[IDX2C(i,id,N)]=us[IDX2C(i,id,N)]*mul;
-    }
-}
-
-template<class ElemType>
-__global__ void _rowElementDivideBy(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //us.GetNumRows();
-    const long M) //a.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= M)
-        return;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType v = a[id];
-    if (v >= 0 && v < EPS_IN_INVERSE)
-        v = EPS_IN_INVERSE;
-    else if (v < 0 && v > -EPS_IN_INVERSE)
-        v = (-EPS_IN_INVERSE);
-
-    for (long i = 0; i < N; ++i)
-    {
-        us[IDX2C(i, id, N)] = us[IDX2C(i, id, N)] / v;
-    }
-}
-
-template<class ElemType>
-__global__ void _ColumnElementDivideBy(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //a.GetNumRows();
-    const long M) //us.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-
-    ElemType smallValue = EPS_IN_INVERSE;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType v=a[id];
-    for (long j=0;j<M;++j)
-    {
-        if (v <0 && v > -smallValue)
-            us[IDX2C(id,j,N)] /= (-smallValue);
-        else if (v >=0 && v < smallValue)
-            us[IDX2C(id,j,N)] /= smallValue;
-        else
-            us[IDX2C(id,j,N)] /= v;
-    }
-
-}
-
-
-template<class ElemType>
-__global__ void _innerProduct(
-    ElemType* c,
-    const ElemType* a,
-    const ElemType* b,
-    const long N, //a.GetNumRows();
-    const long M, //a.GetNumCols();
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=M) || (!isColWise && id>=N))
-        return;
-
-    ElemType sum = 0;
-    long index;
-    if (isColWise)
-    {
-        for (long i=0; i<N; ++i)
-        {
-            index = IDX2C(i,id,N);
-            sum += a[index]*b[index];
-        }
-    }
-    else
-    {
-        for (long j=0; j<M; ++j)
-        {
-            index = IDX2C(id,j,N);
-            sum += a[index]*b[index];
-        }
-    }
-    c[id] = sum;
-}
-
-template<class ElemType>
-__global__ void _assignSignOf(
-    ElemType* a,
-    const ElemType* b,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    ElemType v = b[id];
-    a[id] = (v == (ElemType)0? (ElemType)0 : (v > 0? (ElemType)1 : (ElemType)(-1)));
-}
-
-template<class ElemType>
-__global__ void _addSignOf(
-    ElemType* a,
-    const ElemType* b,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    ElemType v = b[id];
-    a[id] += (v == (ElemType)0? (ElemType)0 : (v > 0? (ElemType)1 : (ElemType)(-1)));
-}
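// Background sketch (illustrative, not part of this file): the
// *ElementMultiplyWith / *ElementDivideBy kernels broadcast a vector across a
// matrix: one thread per row (or column) caches its vector element in a
// register and sweeps the orthogonal dimension, e.g. for a column vector a of
// size N x 1 against an N x M matrix us:
//
//   ElemType mul = a[id];            // a single global read per thread
//   for (long j = 0; j < M; ++j)
//       us[IDX2C(id, j, N)] *= mul;  // touch all of row id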
-
-template<class ElemType>
-__global__ void _vectorMaxMinReduce( //this function processes 1 column per block. this function needs 512 threads
-    const ElemType* us,
-    ElemType* Indexes,
-    ElemType* Values,
-    const long m, //number of rows
-    const long n, //number of cols
-    bool isMax)
-{
-    //we first find max per column
-    __shared__ ElemType partials[512];
-    __shared__ int partialsInd[512];
-    if (isMax)
-    {
-        partials[threadIdx.x]=-10000000;
-    }
-    else
-    {
-        partials[threadIdx.x]=10000000;
-    }
-    partialsInd[threadIdx.x]=-1;
-
-    //int id = blockDim.x * blockIdx.x + threadIdx.x;
-    int loadPerThread = m/blockDim.x;
-
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        if (( isMax ? us[IDX2C(i,blockIdx.x,m)]>partials[threadIdx.x] : us[IDX2C(i,blockIdx.x,m)]<partials[threadIdx.x]) || partialsInd[threadIdx.x]==-1)
-        {
-            partials[threadIdx.x]=us[IDX2C(i,blockIdx.x,m)];
-            partialsInd[threadIdx.x]=i;
-        }
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        if ((isMax ? partials[threadIdx.x+256]>partials[threadIdx.x] : partials[threadIdx.x+256]<partials[threadIdx.x]) && partialsInd[threadIdx.x+256]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+256];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+256];
-        }
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        if ((isMax ? partials[threadIdx.x+128]>partials[threadIdx.x] : partials[threadIdx.x+128]<partials[threadIdx.x]) && partialsInd[threadIdx.x+128]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+128];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+128];
-        }
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        if ((isMax ? partials[threadIdx.x+64]>partials[threadIdx.x] : partials[threadIdx.x+64]<partials[threadIdx.x]) && partialsInd[threadIdx.x+64]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+64];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+64];
-        }
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        if ((isMax ? partials[threadIdx.x+32]>partials[threadIdx.x] : partials[threadIdx.x+32]<partials[threadIdx.x]) && partialsInd[threadIdx.x+32]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+32];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+32];
-        }
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        if ((isMax ? partials[threadIdx.x+16]>partials[threadIdx.x] : partials[threadIdx.x+16]<partials[threadIdx.x]) && partialsInd[threadIdx.x+16]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+16];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+16];
-        }
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        if ((isMax ? partials[threadIdx.x+8]>partials[threadIdx.x] : partials[threadIdx.x+8]<partials[threadIdx.x]) && partialsInd[threadIdx.x+8]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+8];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+8];
-        }
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        if ((isMax ? partials[threadIdx.x+4]>partials[threadIdx.x] : partials[threadIdx.x+4]<partials[threadIdx.x]) && partialsInd[threadIdx.x+4]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+4];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+4];
-        }
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        ElemType mx = partials[0];
-        int ind = partialsInd[0];
-        if ((isMax ? mx<partials[1] : mx>partials[1]) || ind ==-1)
-        {
-            mx = partials[1];
-            ind = partialsInd[1];
-        }
-        if ((isMax ? mx<partials[2] : mx>partials[2]) || ind ==-1)
-        {
-            mx = partials[2];
-            ind = partialsInd[2];
-        }
-        if ((isMax ? mx<partials[3] : mx>partials[3]) || ind ==-1)
-        {
-            mx = partials[3];
-            ind = partialsInd[3];
-        }
-        Values[blockIdx.x] = mx;
-        Indexes[blockIdx.x] = ind;
-    }
-}
-
-template<class ElemType>
-__global__ void _vectorMax(
-    const ElemType* us,
-    ElemType* maxIndexes,
-    ElemType* maxValues,
-    const long m, //number of rows
-    const long n, //number of cols
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    long maxInd = -1;
-    ElemType maxVal = -100000;
-
-    if (isColWise)
-    {
-        if (id>=n)
-            return;
-
-        for (long i=0;i<m;i++)
-        {
-            if (us[IDX2C(i,id,m)]>=maxVal)
-            {
-                maxInd = i;
-                maxVal = us[IDX2C(i,id,m)];
-            }
-        }
-    }
-    else
-    {
-        if (id>=m)
-            return;
-
-        for (long j=0;j<n;j++)
-        {
-            if (us[IDX2C(id,j,m)]>=maxVal)
-            {
-                maxInd = j;
-                maxVal = us[IDX2C(id,j,m)];
-            }
-        }
-    }
-    maxIndexes[id]=maxInd;
-    maxValues[id]=maxVal;
-}
-
-template<class ElemType>
-__global__ void _vectorMin(
-    const ElemType* us,
-    ElemType* minIndexes,
-    ElemType* minValues,
-    const long m, //number of rows
-    const long n, //number of cols
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    long minInd = -1;
-    ElemType minVal = 100000;
-
-    if (isColWise)
-    {
-        if (id>=n)
-            return;
-
-        for (long i=0;i<m;i++)
-        {
-            if (us[IDX2C(i,id,m)]<=minVal)
-            {
-                minInd = i;
-                minVal = us[IDX2C(i,id,m)];
-            }
-        }
-    }
-    else
-    {
-        if (id>=m)
-            return;
-
-        for (long j=0;j<n;j++)
-        {
-            if (us[IDX2C(id,j,m)]<=minVal)
-            {
-                minInd = j;
-                minVal = us[IDX2C(id,j,m)];
-            }
-        }
-    }
-    minIndexes[id]=minInd;
-    minValues[id]=minVal;
-}
-
-template<class ElemType>
-__global__ void _matrixVectorColumnWiseAddWithThreadPerElem(
-    const ElemType* a,
-    ElemType* us,
-    ElemType alpha,
-    const long m, //number of rows
-    const long n) //number of cols
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= m*n)
-        return;
-
-    long col = id / m;
-    long row = id - col*m;
-
-    us[id] += alpha*a[row];
-}
-
-template<class ElemType>
-__global__ void _matrixVectorColumnWiseAddWithThreadPerRow(
-    const ElemType* a,
-    ElemType* us,
-    ElemType alpha,
-    const long m, //number of rows
-    const long n) //number of cols
-{
-#ifdef VALIDATION
-    if (blockDim.x * blockIdx.x + threadIdx.x == 0)
-    {
-        printf("** _matrixVectorColumnWiseAdd on device:\na = %p, us = %p, alpha = %f, m = %ld, n = %ld\n",
-               a,us,alpha,m,n);
-        printf("us[0] = %f\n", us[0]);
-        printf("a[0] = %f\n", a[0]);
-    }
-#endif
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=m)
-        return;
-    ElemType tmp = a[id];
-#ifdef VALIDATION
-    printf("  a[%d] = %f\n", id, tmp);
-#endif
-    for (long j = 0; j < n; ++j )
-    {
-        us[j*m+id] += alpha*tmp;
-    }
-
-}
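// Background note (illustrative, not part of this file): the three
// _matrixVectorColumnWiseAdd* variants trade parallelism against redundant
// loads: thread-per-element maximizes occupancy but re-reads a[row] once per
// column; thread-per-row reads a[row] once into a register and loops over the
// columns; block-per-row shares a single vector element across a whole block.
// Which one wins depends on the matrix shape, which is presumably why all
// three were kept.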
-
-
-template<class ElemType>
-__global__ void _matrixVectorColumnWiseAddBlockPerRow(
-    const ElemType* a,
-    ElemType* us,
-    ElemType alpha,
-    const long m, //number of rows
-    const long n) //number of cols
-{
-    __shared__ ElemType tmp; //shared, so the value read by thread 0 is visible to the whole block
-
-    if (threadIdx.x==0)
-    {
-        tmp = a[blockIdx.x];
-    }
-    __syncthreads();
-
-    int loadPerThread = n/blockDim.x;
-
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? n : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        us[m*blockIdx.x + i] += alpha*tmp;
-    }
-}
-
-
-
-template<class ElemType>
-__global__ void _addScaledDifference(
-    ElemType alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = c[id] + (a[id]-b[id]) * (alpha);
-}
-
-template<class ElemType>
-__global__ void _assignScaledDifference(
-    ElemType alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = (a[id]-b[id]) * (alpha);
-}
-
-template<class ElemType>
-__global__ void _addScaledDifference(
-    ElemType *alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = c[id] + (a[id]-b[id]) * alpha[0];
-}
-
-template<class ElemType>
-__global__ void _assignScaledDifference(
-    ElemType *alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = (a[id]-b[id]) * alpha[0];
-}
-
-template<class ElemType>
-__global__ void _addElementToElement(
-    const ElemType *a, LONG64 indexA,
-    ElemType *c, LONG64 indexC)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>0)
-        return;
-    c[indexC] += a[indexA];
-}
-
-template<class ElemType>
-__global__ void _assignNumOfDiff(
-    const ElemType *a,
-    const ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    __shared__ ElemType partialSums[1024];
-    partialSums[threadIdx.x]=0;
-    //int id = blockDim.x * blockIdx.x + threadIdx.x;
-    LONG64 loadPerThread = N/blockDim.x;
-    for (LONG64 i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        partialSums[threadIdx.x]+=(a[i] != b[i]);
-    }
-    __syncthreads();
-
-    //512
-    if (threadIdx.x<512)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+512];
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+256];
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+128];
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+64];
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+32];
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+16];
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+8];
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+4];
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        c[0] = partialSums[0]+partialSums[1]+partialSums[2]+partialSums[3];
-    }
-}
-
-
-/*template<class ElemType>
-__global__ void _assignNumOfDiff(
-ElemType *a,
-ElemType *b,
-ElemType *c,
-long N)
-{
-//TO DO: replace atomic operation with reduction
-
-__shared__ int totalSum;
-if (threadIdx.x == 0) totalSum = 0;
-__syncthreads();
-
-int id = blockDim.x * blockIdx.x + threadIdx.x;
-if (id>=N)
-return;
-
-int localVal = (a[id] != b[id]);
-atomicAdd(&totalSum, localVal);
-__syncthreads();
-
-c[id] = totalSum;
-}*/
-
-template<class ElemType>
-__global__ void _scaleArray(
-    ElemType alpha,
-    ElemType *us,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    us[id]=us[id]*alpha;
-}
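// Background note (illustrative, not part of this file): _assignNumOfDiff
// counts mismatches with per-thread partial sums followed by a shared-memory
// tree reduction; the commented-out variant shows the simpler alternative, a
// single atomicAdd counter. The reduction form needs only O(log blockDim.x)
// synchronized steps instead of serializing every update on one counter, which
// generally scales better.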
-
-
-template<class ElemType>
-__global__ void _sparseCSRPlusDense(
-    ElemType alpha,
-    const ElemType* m_dVal,
-    const int* m_dRow,
-    const int* m_dCol,
-    ElemType* pArrayDev,
-    LONG64 M)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=M)
-        return;
-    int start = m_dRow[id];
-    int end = m_dRow[id+1];
-    for (int _i=start;_i<end;++_i) //_i indexes row id's slice of the CSR val/col arrays
-    {
-        pArrayDev[IDX2C(id,m_dCol[_i],M)] += (alpha*m_dVal[_i]);
-    }
-}
-
-template<class ElemType>
-__global__ void _sparseCSRElemMulDense(
-    const ElemType* m_dVal,
-    const int* m_dRow,
-    const int* m_dCol,
-    const ElemType* b,
-    ElemType* c,
-    LONG64 M)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=M)
-        return;
-    int start = m_dRow[id];
-    int end = m_dRow[id+1];
-    for (int _i=start;_i<end;++_i) //_i indexes row id's slice of the CSR val/col arrays
-    {
-        c[IDX2C(id,m_dCol[_i],M)] = b[IDX2C(id,m_dCol[_i],M)] * m_dVal[_i];
-    }
-}
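// Background sketch (illustrative, not part of this file): both _sparseCSR*
// kernels use the standard compressed-sparse-row layout, where
// m_dRow[r] .. m_dRow[r+1] delimit row r's slice of the parallel arrays
// (m_dVal, m_dCol). With one thread per row, the traversal is simply:
//
//   int start = m_dRow[id], end = m_dRow[id + 1];
//   for (int k = start; k < end; ++k)      // k walks row id's nonzeros
//       process(m_dVal[k], m_dCol[k]);     // value and its column index (process() is a placeholder)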