Conflicts:
	DataReader/HTKMLFReader/HTKMLFReader.cpp
	DataReader/HTKMLFReader/HTKMLFReader.h
This commit is contained in:
Mike Seltzer 2015-02-06 16:14:47 -08:00
Родитель 26d9e66b87 f3dfe81034
Коммит b4f465b8c4
34 изменённых файлов: 21951 добавлений и 21411 удалений

330
.gitignore поставляемый
Просмотреть файл

@@ -1,162 +1,168 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.sln.docstates
*.orig
# Build results
[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/
# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc
*.dep
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
*.ncrunch*
.*crunch*.local.xml
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.Publish.xml
# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
#packages/
# Windows Azure Build Output
csx
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
App_Data/*.mdf
App_Data/*.ldf
#LightSwitch generated files
GeneratedArtifacts/
_Pvt_Extensions/
ModelManifest.xml
# =========================
# Windows detritus
# =========================
# Windows image file caches
Thumbs.db
ehthumbs.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Mac desktop service store files
.DS_Store
*.lyx~
*.bak
*.lyx#
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.sln.docstates
*.orig
# Build results
[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/
# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc
*.dep
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
*.ncrunch*
.*crunch*.local.xml
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.Publish.xml
# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
#packages/
# Windows Azure Build Output
csx
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
App_Data/*.mdf
App_Data/*.ldf
#LightSwitch generated files
GeneratedArtifacts/
_Pvt_Extensions/
ModelManifest.xml
# =========================
# Windows detritus
# =========================
# Windows image file caches
Thumbs.db
ehthumbs.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Mac desktop service store files
.DS_Store
*.lyx~
*.bak
*.lyx#
# =========================
# prebuild file
# =========================
MachineLearning/cn/buildinfo.h

Просмотреть файл

@@ -1,279 +1,280 @@
//
// <copyright file="ConfigFile.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// ConfigFile.cpp : Defines the configuration file loader.
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "File.h"
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// ParseCommandLine - parse the command line parameters
// argc - count of arguments
// argv - array of argument parameters
// config - config to return
std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
{
config.SetName(std::string("global"));
// This vector keeps track of the config files we have already read
std::vector<std::string> resolvedConfigFiles;
std::string configString;
// start at 1, because 0 is the name of the EXE
for (int i=1; i < argc; ++i)
{
wstring str = argv[i];
// see if they are loading a config file
wstring configDescriptor = L"configFile=";
int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
// no config file, parse as regular argument
if (compare)
{
configString += (msra::strfun::utf8(str) + "\n");
}
else // One or more config file paths specified in a "+"-separated list.
{
const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
std::vector<std::string> filePathsVec = msra::strfun::split(filePaths, "+");
for (auto filePath : filePathsVec)
{
if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
{
// if haven't already read this file, read it
resolvedConfigFiles.push_back(filePath);
configString += config.ReadConfigFile(filePath);
}
else
RuntimeError("Cannot specify same config file multiple times at the command line.");
}
}
}
configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
config.FileParse(configString);
return configString;
}
// ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
// form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
// If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
// and includes the body of each file in the string which is eventually returned by this function. If the included
// config file includes other config files, this function will recursively include those files as well.
// configString - the config string within which to look for "include" statements
// resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
// and to prevent files from being included multiple times.
// returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector<std::string> &resolvedConfigFiles)
{
std::vector<std::string> lines = msra::strfun::split(configString, "\n");
std::string includeKeyword = "include=";
std::size_t includeKeywordSize = includeKeyword.size();
std::string newConfigString;
for (std::string line : lines)
{
if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
{
std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
if (filePaths.find(openBraceVar) != std::string::npos)
{
RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
}
std::vector<std::string> filePathVec = msra::strfun::split (filePaths, "+");
for (auto filePath : filePathVec)
{
// if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
{
// Recursively resolve the include statements in the included config files.
// Ensure that the same config file isn't included twice, by keeping track of the config
// files that have already been resolved in the resolvedPaths vector.
resolvedConfigFiles.push_back(filePath);
newConfigString += ResolveIncludeStatements(
ReadConfigFile(filePath),
resolvedConfigFiles
);
}
else
{
// We already resolved this path. Write a warning so that user is aware of this.
// TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
}
}
}
else
{
newConfigString += (line + "\n");
}
}
return newConfigString;
}
// LoadConfigFiles - load multiple configuration file, and adds to config parameters
// filePaths - A "+" delimited list of file paths, corresponding to config files to load
// configStringToAppend - A config string which should be processed together with the config files
void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
{
std::string configString = ReadConfigFiles(filePaths);
if(configStringToAppend != nullptr)
{
configString += *configStringToAppend;
}
FileParse(configString);
}
// LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
// If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
// then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
// in order to allow them to access variables defined in the primary config file via $varName$ syntax.
// filePath - filePath to the file to load
// config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
{
// read file, resolve variables, and then parse.
std::string fileContents = ReadConfigFile(filePath);
fileContents = config.ResolveVariables(fileContents);
FileParse(fileContents);
}
// LoadConfigFile - load a configuration file, and add to config parameters
// filePath - filePath to the file to read
void ConfigParser::LoadConfigFile(const std::wstring &filePath)
{
// read and then parse
FileParse(ReadConfigFile(filePath));
}
// Same as "ReadConfigFiles" function below, but takes as input string instead of wstring
std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
{
return ReadConfigFiles(msra::strfun::utf16(filePaths));
}
// ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
// filePaths - A "+" delimited list of file paths, corresponding to config files to read
// returns: a string with the concatenated file contents
std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
{
std::string configString;
std::vector<std::wstring> filePathVec = msra::strfun::split (filePaths, L"+");
for (auto filePath : filePathVec)
{
configString += ReadConfigFile(filePath);
}
return configString;
}
// Same as "ReadConfigFile" function below, but takes as input string instead of wstring
std::string ConfigParser::ReadConfigFile(const std::string &filePath)
{
return ReadConfigFile(msra::strfun::utf16(filePath));
}
// ReadConfigFile - read a configuration file, and return as a string
// filePath - the path to the config file to read
// returns: a string with the concatenated file contents
std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
{
File file(filePath, fileOptionsRead);
// initialize with file name
std::string path = msra::strfun::utf8(filePath);
auto location = path.find_last_of("/\\");
if (location != npos)
path = path.substr(location+1);
m_configName = move(path);
// read the entire file into a string
// CONSIDER: should the File API support this, instead of line by line?
size_t fileLength = file.Size();
string str;
string configFile;
configFile.reserve(fileLength);
while (!file.IsEOF())
{
file.GetLine(str);
str = PreprocessConfigLine(str);
if (str != "")
{
configFile.append(str);
configFile.append("\n");
}
}
return configFile;
}
// GetFileConfigNames - determine the names of the features and labels sections in the config file
// features - [in,out] a vector of feature name strings
// labels - [in,out] a vector of label name strings
void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
{
for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
{
auto pair = *iter;
ConfigParameters temp (iter->second);
// see if we have a config parameters that contains a "dim" element, it's a sub key, use it
if (temp.ExistsCurrent("dim"))
{
if (temp.ExistsCurrent("labelMappingFile")
|| temp.ExistsCurrent("labelDim")
|| temp.ExistsCurrent("labelType")
|| (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
{
labels.push_back(msra::strfun::utf16(iter->first));
}
else
{
features.push_back(msra::strfun::utf16(iter->first));
}
}
}
}
// FindConfigNames - determine the names of the hierarchy of sections in the config file that contain a particular key
// config - configuration to search
// key - string we are searching for in each config section
// names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
void FindConfigNames(const ConfigParameters& config, std::string key, std::vector<std::wstring>& names)
{
for (auto iter = config.begin(); iter != config.end(); ++iter)
{
auto pair = *iter;
ConfigParameters temp (iter->second);
// see if we have a config parameters that contains a "key" element, if so use it
if (temp.ExistsCurrent(key))
{
names.push_back(msra::strfun::utf16(iter->first));
}
}
}
// Trim - trim white space off the start and end of the string
// str - string to trim
// NOTE: if the entire string is empty, then the string will be set to an empty string
void Trim(std::string& str)
{
auto found = str.find_first_not_of(" \t");
if (found == npos)
{
str.erase(0);
return;
}
str.erase(0, found);
found = str.find_last_not_of(" \t");
if (found != npos)
str.erase(found+1);
}
//
// <copyright file="ConfigFile.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// ConfigFile.cpp : Defines the configuration file loader.
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "File.h"
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// ParseCommandLine - parse the command line parameters
// argc - count of arguments
// argv - array of argument parameters
// config - config to return
std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
{
config.SetName(std::string("global"));
// This vector keeps track of the config files we have already read
std::vector<std::string> resolvedConfigFiles;
std::string configString;
// start at 1, because 0 is the name of the EXE
for (int i=1; i < argc; ++i)
{
wstring str = argv[i];
// see if they are loading a config file
wstring configDescriptor = L"configFile=";
// case-insensitive prefix comparison: 0 (false) means the argument starts with "configFile="
int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
// no config file, parse as regular argument
if (compare)
{
// plain "name=value" argument: convert to UTF-8 and accumulate, one per line
configString += (msra::strfun::utf8(str) + "\n");
}
else // One or more config file paths specified in a "+"-separated list.
{
const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
std::vector<std::string> filePathsVec = msra::strfun::split(filePaths, "+");
for (auto filePath : filePathsVec)
{
if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
{
// if haven't already read this file, read it
resolvedConfigFiles.push_back(filePath);
configString += config.ReadConfigFile(filePath);
}
else
// a duplicate on the command line is a hard error (duplicate "include"
// statements inside files only produce a warning, by contrast)
RuntimeError("Cannot specify same config file multiple times at the command line.");
}
}
}
// expand any "include=..." lines pulled in from the files read above
configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
config.FileParse(configString);
return configString;
}
// ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
// form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
// If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
// and includes the body of each file in the string which is eventually returned by this function. If the included
// config file includes other config files, this function will recursively include those files as well.
// configString - the config string within which to look for "include" statements
// resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
// and to prevent files from being included multiple times.
// returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector<std::string> &resolvedConfigFiles)
{
std::vector<std::string> lines = msra::strfun::split(configString, "\n");
std::string includeKeyword = "include=";
std::size_t includeKeywordSize = includeKeyword.size();
std::string newConfigString;
for (std::string line : lines)
{
// only a line that begins exactly with "include=" (no leading whitespace) is treated as a directive
if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
{
std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
// variable references cannot be expanded this early in parsing, so reject them outright
if (filePaths.find(openBraceVar) != std::string::npos)
{
RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
}
std::vector<std::string> filePathVec = msra::strfun::split (filePaths, "+");
for (auto filePath : filePathVec)
{
// if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
{
// Recursively resolve the include statements in the included config files.
// Ensure that the same config file isn't included twice, by keeping track of the config
// files that have already been resolved in the resolvedPaths vector.
resolvedConfigFiles.push_back(filePath);
newConfigString += ResolveIncludeStatements(
ReadConfigFile(filePath),
resolvedConfigFiles
);
}
else
{
// We already resolved this path. Write a warning so that user is aware of this.
// TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
}
}
}
else
{
// ordinary config line: pass through unchanged
newConfigString += (line + "\n");
}
}
return newConfigString;
}
// LoadConfigFiles - load multiple configuration file, and adds to config parameters
// filePaths - A "+" delimited list of file paths, corresponding to config files to load
// configStringToAppend - A config string which should be processed together with the config files
void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
{
    // Concatenate the contents of every "+"-separated config file, then
    // optionally append the caller-supplied config fragment before parsing.
    std::string combined = ReadConfigFiles(filePaths);
    if (configStringToAppend != nullptr)
        combined += *configStringToAppend;
    FileParse(combined);
}
// LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
// If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
// then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
// in order to allow them to access variables defined in the primary config file via $varName$ syntax.
// filePath - filePath to the file to load
// config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
{
    // Read the raw file, substitute the $varName$ references using 'config',
    // and only then hand the resolved text to the parser.
    FileParse(config.ResolveVariables(ReadConfigFile(filePath)));
}
// LoadConfigFile - load a configuration file, and add to config parameters
// filePath - filePath to the file to read
void ConfigParser::LoadConfigFile(const std::wstring &filePath)
{
    // Slurp the file's contents, then parse them.
    std::string contents = ReadConfigFile(filePath);
    FileParse(contents);
}
// Same as "ReadConfigFiles" function below, but takes as input string instead of wstring
std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
{
return ReadConfigFiles(msra::strfun::utf16(filePaths));
}
// ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
// filePaths - A "+" delimited list of file paths, corresponding to config files to read
// returns: a string with the concatenated file contents
std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
{
    // Split the "+"-delimited path list and concatenate each file's contents
    // in the order the paths were given.
    std::string combined;
    for (const auto& onePath : msra::strfun::split (filePaths, L"+"))
    {
        combined += ReadConfigFile(onePath);
    }
    return combined;
}
// Same as "ReadConfigFile" function below, but takes as input string instead of wstring
std::string ConfigParser::ReadConfigFile(const std::string &filePath)
{
return ReadConfigFile(msra::strfun::utf16(filePath));
}
// ReadConfigFile - read a configuration file, and return as a string
// filePath - the path to the config file to read
// returns: a string with the concatenated file contents
std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
{
File file(filePath, fileOptionsRead);
// initialize with file name
std::string path = msra::strfun::utf8(filePath);
auto location = path.find_last_of("/\\");
if (location != npos)
path = path.substr(location+1);
// remember the bare file name (directories stripped) as this parser's config name
m_configName = move(path);
// read the entire file into a string
// CONSIDER: should the File API support this, instead of line by line?
size_t fileLength = file.Size();
string str;
string configFile;
configFile.reserve(fileLength);
while (!file.IsEOF())
{
file.GetLine(str);
// PreprocessConfigLine may normalize or strip a line; lines that come back empty are dropped
str = PreprocessConfigLine(str);
if (str != "")
{
configFile.append(str);
configFile.append("\n");
}
}
return configFile;
}
// GetFileConfigNames - determine the names of the features and labels sections in the config file
// features - [in,out] a vector of feature name strings
// labels - [in,out] a vector of label name strings
void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
{
for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
{
// (removed dead local "auto pair = *iter;" which copied the pair every iteration and was never used)
ConfigParameters temp (iter->second);
// see if we have a config parameters that contains a "dim" element, it's a sub key, use it
if (temp.ExistsCurrent("dim"))
{
// any label-specific key (or an explicit sectionType=labels) marks a label section;
// every other section with a "dim" is assumed to describe features
if (temp.ExistsCurrent("labelMappingFile")
|| temp.ExistsCurrent("labelDim")
|| temp.ExistsCurrent("labelType")
|| (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
{
labels.push_back(msra::strfun::utf16(iter->first));
}
else
{
features.push_back(msra::strfun::utf16(iter->first));
}
}
}
}
// FindConfigNames - determine the names of the hierarchy of sections in the config file that contain a particular key
// config - configuration to search
// key - string we are searching for in each config section
// names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
void FindConfigNames(const ConfigParameters& config, std::string key, std::vector<std::wstring>& names)
{
for (auto iter = config.begin(); iter != config.end(); ++iter)
{
// (removed dead local "auto pair = *iter;" which copied the pair every iteration and was never used)
ConfigParameters temp (iter->second);
// see if we have a config parameters that contains a "key" element, if so use it
if (temp.ExistsCurrent(key))
{
names.push_back(msra::strfun::utf16(iter->first));
}
}
}
// Trim - trim white space off the start and end of the string
// str - string to trim
// NOTE: if the entire string is empty, then the string will be set to an empty string
// Trim - remove leading and trailing spaces/tabs from 'str' in place.
// A string consisting solely of whitespace collapses to the empty string.
void Trim(std::string& str)
{
    const auto first = str.find_first_not_of(" \t");
    if (first == std::string::npos)
    {
        // nothing but whitespace (or already empty) -- result is ""
        str.clear();
        return;
    }
    str.erase(0, first);
    const auto last = str.find_last_not_of(" \t");
    if (last != std::string::npos)
        str.erase(last + 1);
}
}}}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@@ -0,0 +1,13 @@
#pragma once
#define MS_PER_SEC 1000
namespace Microsoft{namespace MSR {namespace CNTK {
// Timer - tiny utility exposing wall-clock time in milliseconds.
// The user-written empty constructor/destructor (with stray trailing
// semicolons) were removed: the class holds no state, so the
// compiler-generated defaults are identical (Rule of Zero).
class Timer
{
public:
    // Milliseconds elapsed since the Unix epoch (see TimerUtility.cpp).
    static unsigned long long MilliSecondElapsed();
};
}}}

39
Common/TimerUtility.cpp Normal file
Просмотреть файл

@@ -0,0 +1,39 @@
#include "TimerUtility.h"
#ifdef WIN32
#include <Windows.h>
#else
#include <time.h>
#endif
namespace Microsoft{
namespace MSR {
namespace CNTK {
//Returns the amount of milliseconds elapsed
// MilliSecondElapsed - wall-clock milliseconds since the Unix epoch (1970-01-01).
// Both branches are aligned to the same epoch so values are comparable across platforms.
// returns: milliseconds as an unsigned 64-bit count
unsigned long long Timer::MilliSecondElapsed()
{
#ifdef WIN32
    FILETIME ft;
    LARGE_INTEGER li;
    GetSystemTimeAsFileTime(&ft); //ideally we should use GetSystemTimePreciseAsFileTime. But it's only avaiable with Win8+ and Win Server 2012+
    li.LowPart = ft.dwLowDateTime;
    li.HighPart = ft.dwHighDateTime;
    unsigned long long ret = li.QuadPart;
    ret -= 116444736000000000LL; // shift epoch from 1601-01-01 (FILETIME) to 1970-01-01, consistent with Linux.
    ret /= 10000;                // From 100 nano seconds (10^-7) to 1 millisecond (10^-3)
    return ret;
#else
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
    // Use unsigned long long rather than the Windows-only UINT64 typedef so this
    // POSIX branch compiles without any Windows basetypes header.
    unsigned long long ret = static_cast<unsigned long long>(ts.tv_sec) * 1000 + ts.tv_nsec / 1000000;
    return ret;
#endif
}
}
}
}

Просмотреть файл

@@ -4,7 +4,10 @@
// </copyright>
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
#define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux

Просмотреть файл

@@ -49,17 +49,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_truncated = readerConfig("Truncated", "false");
m_convertLabelsToTargets = false;
m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
if (m_numberOfuttsPerMinibatch < 1)
for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
{
LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
if (m_numberOfuttsPerMinibatch < 1)
{
LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
}
if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
}
}
if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
}
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// get the read method, defaults to "blockRandomize" other option is "rollingWindow"
std::string readMethod(readerConfig("readMethod","blockRandomize"));
if (readMethod == "blockRandomize" && randomize == randomizeNone)
{
fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto");
randomize = randomizeAuto;
}
// see if they want to use readAhead
m_readAhead = readerConfig("readAhead", "false");
@@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now get the frame source. This has better randomization and doesn't create temp files
m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
m_frameSource->setverbosity(verbosity);
//m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
}
@@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_mbSize = mbSize;
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
if (m_trainOrTest)
{
StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);

Просмотреть файл

@@ -1,3 +1,4 @@
<<<<<<< HEAD
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
@@ -111,4 +112,117 @@ public:
void SetSentenceEnd(int /*actualMbSize*/){};
};
=======
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples
#pragma once
#include "DataReader.h"
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class HTKMLFReader : public IDataReader<ElemType>
{
private:
msra::dbn::minibatchiterator* m_mbiter;
msra::dbn::minibatchsource* m_frameSource;
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
msra::dbn::FileEvalSource* m_fileEvalSource;
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
vector<bool> m_sentenceEnd;
bool m_readAhead;
bool m_truncated;
vector<size_t> m_processedFrame;
intargvector m_numberOfuttsPerMinibatchForAllEpochs;
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
bool m_noData;
bool m_trainOrTest; // if false, in file writing mode
std::map<LabelIdType, LabelType> m_idToLabelMap;
bool m_partialMinibatch; // allow partial minibatches?
std::vector<ElemType*> m_featuresBufferMultiUtt;
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
std::vector<ElemType*> m_labelsBufferMultiUtt;
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
std::vector<size_t> m_featuresStartIndexMultiUtt;
std::vector<size_t> m_labelsStartIndexMultiUtt;
std::vector<ElemType*> m_featuresBufferMultiIO;
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
std::vector<ElemType*> m_labelsBufferMultiIO;
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
std::map<std::wstring,size_t> m_featureNameToIdMap;
std::map<std::wstring,size_t> m_labelNameToIdMap;
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
std::vector <bool> m_convertLabelsToTargetsMultiIO;
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
size_t m_inputFileIndex;
std::vector<size_t> m_featDims;
std::vector<size_t> m_labelDims;
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
void PrepareForTrainingOrTesting(const ConfigParameters& config);
void PrepareForWriting(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
bool ReNewBufferForMultiIO(size_t i);
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
void SetNbrSlicesEachRecurrentIter(const size_t) { };
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
enum InputOutputTypes
{
real,
category,
};
public:
virtual void Init(const ConfigParameters& config);
virtual void Destroy() {delete this;}
virtual ~HTKMLFReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
void SetSentenceEnd(int /*actualMbSize*/){};
};
>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d
}}}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -768,6 +768,7 @@ private:
if (chunkdata.isinram())
return false;
if (verbosity)
fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
@ -858,6 +859,7 @@ public:
transcripts.clear();
// return these utterances
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
@ -922,6 +924,7 @@ public:
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[firstchunk].windowbegin;
const size_t windowend = randomizedchunks[lastchunk].windowend;
if (verbosity > 0)
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside

Просмотреть файл

@ -102,7 +102,7 @@ class minibatchutterancesourcemulti : public minibatchsource
bool isinram() const { return !frames.empty(); }
// page in data for this chunk
// We pass in the feature info variables by ref which will be filled lazily upon first read
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
{
if (numutterances() == 0)
throw std::logic_error ("requiredata: cannot page in virgin block");
@ -132,6 +132,7 @@ class minibatchutterancesourcemulti : public minibatchsource
latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
}
//fprintf (stderr, "\n");
if (verbosity)
fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size());
}
catch (...)
@ -568,6 +569,7 @@ private:
return sweep;
currentsweep = sweep;
if (verbosity>0)
fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep
@ -919,10 +921,11 @@ private:
{
auto & chunk = randomizedchunks[m][chunkindex];
auto & chunkdata = chunk.getchunkdata();
if (verbosity)
fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices);
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
});
}
chunksinram++;
@ -1029,7 +1032,8 @@ public:
}
}
// return these utterances
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
{
@ -1107,6 +1111,7 @@ public:
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
const size_t windowend = randomizedchunks[0][lastchunk].windowend;
if (verbosity)
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside
@ -1230,3 +1235,4 @@ public:
};
};};

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -60,7 +60,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>Use</PrecompiledHeader>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>EVALDLL;WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@ -79,7 +79,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>Use</PrecompiledHeader>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
@ -107,6 +107,7 @@
<ClInclude Include="..\..\Common\Include\Eval.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
@ -127,6 +128,7 @@
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="..\cn\ComputationNode.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>

Просмотреть файл

@ -1,50 +1,56 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\Common\BestGpu.cpp" />
<ClCompile Include="..\cn\ComputationNode.cpp" />
<ClCompile Include="..\cn\PTaskGraphBuilder.cpp" />
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="CNTKEval.cpp" />
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\Eval.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="CNTKEval.h" />
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
<UniqueIdentifier>{bed53b47-70b1-494c-824d-0748362003b2}</UniqueIdentifier>
</Filter>
<Filter Include="Common\Include">
<UniqueIdentifier>{f3bf0104-8a08-40c9-a4d9-af8411c49669}</UniqueIdentifier>
</Filter>
</ItemGroup>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\Common\BestGpu.cpp" />
<ClCompile Include="..\cn\ComputationNode.cpp" />
<ClCompile Include="..\cn\PTaskGraphBuilder.cpp" />
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="CNTKEval.cpp" />
<ClCompile Include="..\..\Common\ConfigFile.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\Eval.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="CNTKEval.h" />
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
<UniqueIdentifier>{bed53b47-70b1-494c-824d-0748362003b2}</UniqueIdentifier>
</Filter>
<Filter Include="Common\Include">
<UniqueIdentifier>{f3bf0104-8a08-40c9-a4d9-af8411c49669}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -3111,6 +3111,9 @@ protected: \
inputGradientValues.Print("child Gradient-in/out");
inputFunctionValues.Print("child Function values");
#endif
//currently we only support one combination when the input is sparse.
if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol);
Matrix<ElemType>::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues);
#if DUMPOUTPUT

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,350 +1,349 @@
//
// <copyright file="SimpleEvaluator.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "ComputationNetwork.h"
#include "ComputationNetworkHelper.h"
#include "DataReader.h"
#include <vector>
#include <string>
#include <stdexcept>
#include "basetypes.h"
#include "fileutil.h"
#include "commandArgUtil.h"
#include <fstream>
using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class SimpleEvaluator : ComputationNetworkHelper<ElemType>
{
typedef ComputationNetworkHelper<ElemType> B;
using B::UpdateEvalTimeStamps;
protected:
typedef ComputationNode<ElemType>* ComputationNodePtr;
typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
public:
SimpleEvaluator(ComputationNetwork<ElemType>& net, const size_t numMBsToShowResult=100, const int traceLevel=0)
: m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel)
{
}
//returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
vector<ElemType> Evaluate(IDataReader<ElemType>& dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize=requestDataSize)
{
//specify evaluation nodes
std::vector<ComputationNodePtr> evalNodes;
if (evalNodeNames.size() == 0)
{
fprintf (stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n");
if (m_net.EvaluationNodes().size() == 0 && m_net.FinalCriterionNodes().size() == 0)
throw std::logic_error("There is no default evalnodes or training criterion node specified in the network.");
for (int i=0; i< m_net.EvaluationNodes().size(); i++)
evalNodes.push_back(m_net.EvaluationNodes()[i]);
for (int i=0; i< m_net.FinalCriterionNodes().size(); i++)
evalNodes.push_back(m_net.FinalCriterionNodes()[i]);
}
else
{
for (int i=0; i<evalNodeNames.size(); i++)
{
ComputationNodePtr node = m_net.GetNodeFromName(evalNodeNames[i]);
m_net.BuildAndValidateNetwork(node);
if (!node->FunctionValues().GetNumElements() == 1)
{
throw std::logic_error("The nodes passed to SimpleEvaluator::Evaluate function must be either eval or training criterion nodes (which evalues to 1x1 value).");
}
evalNodes.push_back(node);
}
}
//initialize eval results
std::vector<ElemType> evalResults;
for (int i=0; i< evalNodes.size(); i++)
{
evalResults.push_back((ElemType)0);
evalNodes[i]->Reset();
}
//prepare features and labels
std::vector<ComputationNodePtr> & FeatureNodes = m_net.FeatureNodes();
std::vector<ComputationNodePtr> & labelNodes = m_net.LabelNodes();
std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
for (size_t i=0; i<FeatureNodes.size(); i++)
{
inputMatrices[FeatureNodes[i]->NodeName()] = &FeatureNodes[i]->FunctionValues();
}
for (size_t i=0; i<labelNodes.size(); i++)
{
inputMatrices[labelNodes[i]->NodeName()] = &labelNodes[i]->FunctionValues();
}
//evaluate through minibatches
size_t totalEpochSamples = 0;
size_t numMBsRun = 0;
size_t actualMBSize = 0;
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; //MBs run before this display
std::vector<ElemType> evalResultsLastMBs;
for (int i=0; i< evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType)0);
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
dataReader.SetNbrSlicesEachRecurrentIter(1);
for (int i=0; i<evalNodes.size(); i++)
{
if (evalNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax")
{
size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows();
if(inputMatrices.find(L"classinfo") == inputMatrices.end())
{
inputMatrices[L"idx2cls"] = new Matrix<ElemType>(vSz, 1, m_net.GetDeviceID());
inputMatrices[L"classinfo"] = new Matrix<ElemType>(vSz, 1, m_net.GetDeviceID());
}
ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evalNodes[i];
crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]);
}
}
while (dataReader.GetMinibatch(inputMatrices))
{
UpdateEvalTimeStamps(FeatureNodes);
UpdateEvalTimeStamps(labelNodes);
actualMBSize = m_net.GetActualMBSize();
m_net.SetActualMiniBatchSize(actualMBSize);
m_net.SetActualNbrSlicesInEachRecIter(dataReader.NumberSlicesInEachRecurrentIter());
dataReader.SetSentenceEndInBatch(m_net.m_sentenceEnd);
for (int i=0; i<evalNodes.size(); i++)
{
m_net.Evaluate(evalNodes[i]);
evalResults[i] += evalNodes[i]->FunctionValues().Get00Element(); //criterionNode should be a scalar
}
totalEpochSamples += actualMBSize;
numMBsRun++;
if (m_traceLevel > 0)
{
numSamplesLastMBs += actualMBSize;
if (numMBsRun % m_numMBsToShowResult == 0)
{
DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
for (int i=0; i<evalResults.size(); i++)
{
evalResultsLastMBs[i] = evalResults[i];
}
numSamplesLastMBs = 0;
lastMBsRun = numMBsRun;
}
}
/// call DataEnd to check if end of sentence is reached
/// datareader will do its necessary/specific process for sentence ending
dataReader.DataEnd(endDataSentence);
}
// show last batch of results
if (m_traceLevel > 0 && numSamplesLastMBs > 0)
{
DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
}
//final statistics
for (int i=0; i<evalResultsLastMBs.size(); i++)
{
evalResultsLastMBs[i] = 0;
}
fprintf(stderr,"Final Results: ");
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastMBs);
for (int i=0; i<evalResults.size(); i++)
{
evalResults[i] /= totalEpochSamples;
}
if (inputMatrices[L"classinfo"])
{
delete inputMatrices[L"classinfo"];
inputMatrices.erase(L"classinfo");
}
if (inputMatrices[L"idx2cls"])
{
delete inputMatrices[L"idx2cls"];
inputMatrices.erase(L"idx2cls");
}
return evalResults;
}
//returns error rate
ElemType EvaluateUnroll(IDataReader<ElemType>& dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
{
std::vector<ComputationNodePtr> FeatureNodes = m_net.FeatureNodes();
std::vector<ComputationNodePtr> labelNodes = m_net.LabelNodes();
std::vector<ComputationNodePtr> criterionNodes = m_net.FinalCriterionNodes();
std::vector<ComputationNodePtr> evaluationNodes = m_net.EvaluationNodes();
if (criterionNodes.size()==0)
{
throw std::runtime_error("No CrossEntropyWithSoftmax node found\n");
}
if (evaluationNodes.size()==0)
{
throw std::runtime_error("No Evaluation node found\n");
}
std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
for (size_t i=0; i<FeatureNodes.size(); i++)
{
inputMatrices[FeatureNodes[i]->NodeName()] = &FeatureNodes[i]->FunctionValues();
}
for (size_t i=0; i<labelNodes.size(); i++)
{
inputMatrices[labelNodes[i]->NodeName()] = &labelNodes[i]->FunctionValues();
}
inputMatrices[L"numberobs"] = new Matrix<ElemType>(1,1, m_net.GetDeviceID());
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
ElemType epochEvalError = 0;
ElemType epochCrossEntropy = 0;
size_t totalEpochSamples = 0;
ElemType prevEpochEvalError = 0;
ElemType prevEpochCrossEntropy = 0;
size_t prevTotalEpochSamples = 0;
size_t prevStart = 1;
size_t numSamples = 0;
ElemType crossEntropy = 0;
ElemType evalError = 0;
ofstream outputStream;
if (output)
{
#ifdef _MSC_VER
outputStream.open(output);
#else
outputStream.open(charpath(output)); // GCC does not implement wide-char pathnames here
#endif
}
size_t numMBsRun = 0;
size_t actualMBSize = 0;
while (dataReader.GetMinibatch(inputMatrices))
{
size_t nbrSamples = (size_t)(*inputMatrices[L"numberobs"])(0, 0);
actualMBSize = nbrSamples;
for (int npos = 0; npos < nbrSamples ; npos++)
{
FeatureNodes[npos]->UpdateEvalTimeStamp();
labelNodes[npos]->UpdateEvalTimeStamp();
m_net.Evaluate(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more?
m_net.Evaluate(evaluationNodes[npos]);
ElemType mbCrossEntropy = criterionNodes[npos]->FunctionValues().Get00Element(); // criterionNode should be a scalar
epochCrossEntropy += mbCrossEntropy;
ElemType mbEvalError = evaluationNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
epochEvalError += mbEvalError;
}
totalEpochSamples += actualMBSize;
if (outputStream.is_open())
{
//TODO: add support to dump multiple outputs
ComputationNodePtr outputNode = m_net.OutputNodes()[0];
foreach_column(j, outputNode->FunctionValues())
{
foreach_row(i,outputNode->FunctionValues())
{
outputStream<<outputNode->FunctionValues()(i,j)<<" ";
}
outputStream<<endl;
}
}
numMBsRun++;
if (numMBsRun % m_numMBsToShowResult == 0)
{
numSamples = (totalEpochSamples - prevTotalEpochSamples);
crossEntropy = epochCrossEntropy - prevEpochCrossEntropy;
evalError = epochEvalError - prevEpochEvalError;
fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n",
prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples);
prevTotalEpochSamples = totalEpochSamples;
prevEpochCrossEntropy = epochCrossEntropy;
prevEpochEvalError = epochEvalError;
prevStart = numMBsRun + 1;
}
}
// show final grouping of output
numSamples = totalEpochSamples - prevTotalEpochSamples;
if (numSamples > 0)
{
crossEntropy = epochCrossEntropy - prevEpochCrossEntropy;
evalError = epochEvalError - prevEpochEvalError;
fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n",
prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples);
}
//final statistics
epochEvalError /= (ElemType)totalEpochSamples;
epochCrossEntropy /= (ElemType)totalEpochSamples;
fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy);
if (outputStream.is_open())
{
outputStream.close();
}
evalSetCrossEntropy = epochCrossEntropy;
return epochEvalError;
}
protected:
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector<ComputationNodePtr>& evalNodes,
const vector<ElemType> & evalResults, const vector<ElemType> & evalResultsLastMBs)
{
fprintf(stderr,"Minibatch[%lu-%lu]: Samples Seen = %lu ", startMBNum, endMBNum, numSamplesLastMBs);
for (size_t i=0; i<evalResults.size(); i++)
{
fprintf(stderr, "%ls/Sample = %.8g ", evalNodes[i]->NodeName().c_str(), (evalResults[i]-evalResultsLastMBs[i])/numSamplesLastMBs);
}
fprintf(stderr, "\n");
}
protected:
ComputationNetwork<ElemType>& m_net;
size_t m_numMBsToShowResult;
int m_traceLevel;
void operator=(const SimpleEvaluator&); // (not assignable)
};
}}}
//
// <copyright file="SimpleEvaluator.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "ComputationNetwork.h"
#include "ComputationNetworkHelper.h"
#include "DataReader.h"
#include <vector>
#include <string>
#include <stdexcept>
#include "basetypes.h"
#include "fileutil.h"
#include "commandArgUtil.h"
#include <fstream>
using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class SimpleEvaluator : ComputationNetworkHelper<ElemType>
{
typedef ComputationNetworkHelper<ElemType> B;
using B::UpdateEvalTimeStamps;
protected:
typedef ComputationNode<ElemType>* ComputationNodePtr;
typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
public:
SimpleEvaluator(ComputationNetwork<ElemType>& net, const size_t numMBsToShowResult=100, const int traceLevel=0)
: m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel)
{
}
//returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
vector<ElemType> Evaluate(IDataReader<ElemType>& dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize=requestDataSize)
{
//specify evaluation nodes
std::vector<ComputationNodePtr> evalNodes;
if (evalNodeNames.size() == 0)
{
fprintf (stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n");
if (m_net.EvaluationNodes().size() == 0 && m_net.FinalCriterionNodes().size() == 0)
throw std::logic_error("There is no default evalnodes or training criterion node specified in the network.");
for (int i=0; i< m_net.EvaluationNodes().size(); i++)
evalNodes.push_back(m_net.EvaluationNodes()[i]);
for (int i=0; i< m_net.FinalCriterionNodes().size(); i++)
evalNodes.push_back(m_net.FinalCriterionNodes()[i]);
}
else
{
for (int i=0; i<evalNodeNames.size(); i++)
{
ComputationNodePtr node = m_net.GetNodeFromName(evalNodeNames[i]);
m_net.BuildAndValidateNetwork(node);
if (!node->FunctionValues().GetNumElements() == 1)
{
throw std::logic_error("The nodes passed to SimpleEvaluator::Evaluate function must be either eval or training criterion nodes (which evalues to 1x1 value).");
}
evalNodes.push_back(node);
}
}
//initialize eval results
std::vector<ElemType> evalResults;
for (int i=0; i< evalNodes.size(); i++)
{
evalResults.push_back((ElemType)0);
evalNodes[i]->Reset();
}
//prepare features and labels
std::vector<ComputationNodePtr> & FeatureNodes = m_net.FeatureNodes();
std::vector<ComputationNodePtr> & labelNodes = m_net.LabelNodes();
std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
for (size_t i=0; i<FeatureNodes.size(); i++)
{
inputMatrices[FeatureNodes[i]->NodeName()] = &FeatureNodes[i]->FunctionValues();
}
for (size_t i=0; i<labelNodes.size(); i++)
{
inputMatrices[labelNodes[i]->NodeName()] = &labelNodes[i]->FunctionValues();
}
//evaluate through minibatches
size_t totalEpochSamples = 0;
size_t numMBsRun = 0;
size_t actualMBSize = 0;
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; //MBs run before this display
std::vector<ElemType> evalResultsLastMBs;
for (int i=0; i< evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType)0);
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
for (int i=0; i<evalNodes.size(); i++)
{
if (evalNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax")
{
size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows();
if(inputMatrices.find(L"classinfo") == inputMatrices.end())
{
inputMatrices[L"idx2cls"] = new Matrix<ElemType>(vSz, 1, m_net.GetDeviceID());
inputMatrices[L"classinfo"] = new Matrix<ElemType>(vSz, 1, m_net.GetDeviceID());
}
ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evalNodes[i];
crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]);
}
}
while (dataReader.GetMinibatch(inputMatrices))
{
UpdateEvalTimeStamps(FeatureNodes);
UpdateEvalTimeStamps(labelNodes);
actualMBSize = m_net.GetActualMBSize();
m_net.SetActualMiniBatchSize(actualMBSize);
m_net.SetActualNbrSlicesInEachRecIter(dataReader.NumberSlicesInEachRecurrentIter());
dataReader.SetSentenceEndInBatch(m_net.m_sentenceEnd);
for (int i=0; i<evalNodes.size(); i++)
{
m_net.Evaluate(evalNodes[i]);
evalResults[i] += evalNodes[i]->FunctionValues().Get00Element(); //criterionNode should be a scalar
}
totalEpochSamples += actualMBSize;
numMBsRun++;
if (m_traceLevel > 0)
{
numSamplesLastMBs += actualMBSize;
if (numMBsRun % m_numMBsToShowResult == 0)
{
DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
for (int i=0; i<evalResults.size(); i++)
{
evalResultsLastMBs[i] = evalResults[i];
}
numSamplesLastMBs = 0;
lastMBsRun = numMBsRun;
}
}
/// call DataEnd to check if end of sentence is reached
/// datareader will do its necessary/specific process for sentence ending
dataReader.DataEnd(endDataSentence);
}
// show last batch of results
if (m_traceLevel > 0 && numSamplesLastMBs > 0)
{
DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
}
//final statistics
for (int i=0; i<evalResultsLastMBs.size(); i++)
{
evalResultsLastMBs[i] = 0;
}
fprintf(stderr,"Final Results: ");
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastMBs);
for (int i=0; i<evalResults.size(); i++)
{
evalResults[i] /= totalEpochSamples;
}
if (inputMatrices[L"classinfo"])
{
delete inputMatrices[L"classinfo"];
inputMatrices.erase(L"classinfo");
}
if (inputMatrices[L"idx2cls"])
{
delete inputMatrices[L"idx2cls"];
inputMatrices.erase(L"idx2cls");
}
return evalResults;
}
//returns error rate
ElemType EvaluateUnroll(IDataReader<ElemType>& dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
{
std::vector<ComputationNodePtr> FeatureNodes = m_net.FeatureNodes();
std::vector<ComputationNodePtr> labelNodes = m_net.LabelNodes();
std::vector<ComputationNodePtr> criterionNodes = m_net.FinalCriterionNodes();
std::vector<ComputationNodePtr> evaluationNodes = m_net.EvaluationNodes();
if (criterionNodes.size()==0)
{
throw std::runtime_error("No CrossEntropyWithSoftmax node found\n");
}
if (evaluationNodes.size()==0)
{
throw std::runtime_error("No Evaluation node found\n");
}
std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
for (size_t i=0; i<FeatureNodes.size(); i++)
{
inputMatrices[FeatureNodes[i]->NodeName()] = &FeatureNodes[i]->FunctionValues();
}
for (size_t i=0; i<labelNodes.size(); i++)
{
inputMatrices[labelNodes[i]->NodeName()] = &labelNodes[i]->FunctionValues();
}
inputMatrices[L"numberobs"] = new Matrix<ElemType>(1,1, m_net.GetDeviceID());
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
ElemType epochEvalError = 0;
ElemType epochCrossEntropy = 0;
size_t totalEpochSamples = 0;
ElemType prevEpochEvalError = 0;
ElemType prevEpochCrossEntropy = 0;
size_t prevTotalEpochSamples = 0;
size_t prevStart = 1;
size_t numSamples = 0;
ElemType crossEntropy = 0;
ElemType evalError = 0;
ofstream outputStream;
if (output)
{
#ifdef _MSC_VER
outputStream.open(output);
#else
outputStream.open(charpath(output)); // GCC does not implement wide-char pathnames here
#endif
}
size_t numMBsRun = 0;
size_t actualMBSize = 0;
while (dataReader.GetMinibatch(inputMatrices))
{
size_t nbrSamples = (size_t)(*inputMatrices[L"numberobs"])(0, 0);
actualMBSize = nbrSamples;
for (int npos = 0; npos < nbrSamples ; npos++)
{
FeatureNodes[npos]->UpdateEvalTimeStamp();
labelNodes[npos]->UpdateEvalTimeStamp();
m_net.Evaluate(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more?
m_net.Evaluate(evaluationNodes[npos]);
ElemType mbCrossEntropy = criterionNodes[npos]->FunctionValues().Get00Element(); // criterionNode should be a scalar
epochCrossEntropy += mbCrossEntropy;
ElemType mbEvalError = evaluationNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
epochEvalError += mbEvalError;
}
totalEpochSamples += actualMBSize;
if (outputStream.is_open())
{
//TODO: add support to dump multiple outputs
ComputationNodePtr outputNode = m_net.OutputNodes()[0];
foreach_column(j, outputNode->FunctionValues())
{
foreach_row(i,outputNode->FunctionValues())
{
outputStream<<outputNode->FunctionValues()(i,j)<<" ";
}
outputStream<<endl;
}
}
numMBsRun++;
if (numMBsRun % m_numMBsToShowResult == 0)
{
numSamples = (totalEpochSamples - prevTotalEpochSamples);
crossEntropy = epochCrossEntropy - prevEpochCrossEntropy;
evalError = epochEvalError - prevEpochEvalError;
fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n",
prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples);
prevTotalEpochSamples = totalEpochSamples;
prevEpochCrossEntropy = epochCrossEntropy;
prevEpochEvalError = epochEvalError;
prevStart = numMBsRun + 1;
}
}
// show final grouping of output
numSamples = totalEpochSamples - prevTotalEpochSamples;
if (numSamples > 0)
{
crossEntropy = epochCrossEntropy - prevEpochCrossEntropy;
evalError = epochEvalError - prevEpochEvalError;
fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n",
prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples);
}
//final statistics
epochEvalError /= (ElemType)totalEpochSamples;
epochCrossEntropy /= (ElemType)totalEpochSamples;
fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy);
if (outputStream.is_open())
{
outputStream.close();
}
evalSetCrossEntropy = epochCrossEntropy;
return epochEvalError;
}
protected:
// Prints per-node average evaluation results for minibatches [startMBNum..endMBNum] to stderr.
// startMBNum/endMBNum:  minibatch range covered by this report.
// numSamplesLastMBs:    number of samples seen in that range (divisor for the per-sample averages).
// evalNodes:            evaluation nodes whose names label each printed result.
// evalResults:          cumulative results up to and including this range.
// evalResultsLastMBs:   cumulative results up to the previous report; the difference is this range's total.
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector<ComputationNodePtr>& evalNodes,
    const vector<ElemType> & evalResults, const vector<ElemType> & evalResultsLastMBs)
{
    // Cast size_t explicitly: passing a raw size_t for %lu is undefined behavior on LLP64
    // platforms (64-bit Windows), where size_t is 64 bits but unsigned long is only 32.
    fprintf(stderr, "Minibatch[%lu-%lu]: Samples Seen = %lu ",
        (unsigned long)startMBNum, (unsigned long)endMBNum, (unsigned long)numSamplesLastMBs);
    for (size_t i = 0; i < evalResults.size(); i++)
    {
        // Report the delta since the previous report, averaged per sample.
        fprintf(stderr, "%ls/Sample = %.8g ", evalNodes[i]->NodeName().c_str(), (evalResults[i] - evalResultsLastMBs[i]) / numSamplesLastMBs);
    }
    fprintf(stderr, "\n");
}
protected:
// The network being evaluated; held by reference — must outlive this evaluator.
ComputationNetwork<ElemType>& m_net;
// A progress line is printed after every m_numMBsToShowResult minibatches.
size_t m_numMBsToShowResult;
// Verbosity level for diagnostic output.
int m_traceLevel;
void operator=(const SimpleEvaluator&); // declared but never defined: pre-C++11 idiom making the class non-assignable (equivalent of "= delete")
};
}}}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -139,6 +139,9 @@
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>Copy content files to target directory</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>prebuild.bat</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
@ -199,6 +202,9 @@
<Message>
</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>prebuild.bat</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<Text Include="DefaultMacros.txt" />
@ -216,6 +222,7 @@
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\hostname.h" />
<ClInclude Include="..\..\Common\Include\nvml.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="CompositeComputationNode.h" />
<ClInclude Include="ComputationNetwork.h" />
<ClInclude Include="ComputationNetworkHelper.h" />
@ -249,6 +256,7 @@
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="cn.cpp" />
<ClCompile Include="ComputationNode.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />

Просмотреть файл

@ -43,6 +43,9 @@
<ClCompile Include="NetworkDescriptionLanguage.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h">
@ -138,6 +141,9 @@
<ClInclude Include="..\..\Common\Include\hostname.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">

Просмотреть файл

@ -0,0 +1,30 @@
@echo off
:: Generates buildinfo.h with build metadata: git branch/commit (when git is
:: available), builder account, build machine, and the build path.
echo #ifndef _BUILDINFO_H > buildinfo.h
echo #define _BUILDINFO_H >> buildinfo.h
FOR /F "usebackq" %%i IN (`hostname`) DO SET HOST=%%i
:: assuming hostname always exists
:: Probe whether git is on the PATH, suppressing all output.
:: (Fix: the original "git --version 2 > nul" passed "2" as an argument to git
:: and redirected stdout instead of stderr.)
git --version >nul 2>&1
:: 9009 is cmd.exe's "command is not recognized" exit code (original tested 9909).
if not %ERRORLEVEL% == 9009 (
    echo #define _GIT_EXIST >> buildinfo.h
    :: Echo directly from the FOR loop variable. The original did
    :: SET BRANCH=... and then used %BRANCH% inside this parenthesized block,
    :: but %VAR% is expanded when the whole block is parsed, so the defines
    :: came out empty/stale unless delayed expansion was enabled.
    FOR /F "usebackq" %%i IN (`git rev-parse --abbrev-ref HEAD`) DO echo #define _BUILDBRANCH_ "%%i" >> buildinfo.h
    FOR /F "usebackq" %%i IN (`git rev-parse HEAD`) DO echo #define _BUILDSHA1_ "%%i" >> buildinfo.h
)
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h
:: Double the backslashes in the script directory so the path is a valid C string literal.
set a=%~dp0
set buildpath="%a:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h
echo #endif >> buildinfo.h

Просмотреть файл

@ -1,217 +1,264 @@
//
// <copyright file="MatrixUnitTests.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#include "stdafx.h"
#include "CppUnitTest.h"
#include "..\Math\Matrix.h"
#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project
#define epsilon 0.000001
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
using namespace Microsoft::MSR::CNTK;
using namespace Microsoft::VisualStudio::CppUnitTestFramework;
namespace CNTKMathTest
{
TEST_CLASS(MatrixUnitTest)
{
public:
//This test should fail if you don't have CUDA GPU (or working under remote desktop)
TEST_METHOD(MatrixChangeModesBetweenDenseAndSparseTests_Simple)
{
Matrix<float> A;
A.AssignTruncateBottomOf(Matrix<float>::RandomUniform(4096,2048,-3,0.1,0),0);
long n0 = A.MatrixNorm0();
Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
A.SwitchToMatrixType(MatrixType::SPARSE);
Assert::IsTrue(MatrixType::SPARSE==A.GetMatrixType());
long n1 = A.MatrixNorm0();
Assert::AreEqual<long>(n0,n1);
A.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
}
TEST_METHOD(MatrixSparseTimesDense)
{
Matrix<float> Ad; //DENSE
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(4096,2048,-3,0.1,0),0);//DENSE
Matrix<float> As(Ad);//DENSE
As.SwitchToMatrixType(MatrixType::SPARSE); //!!! MATRIX As becomes sparse
Matrix<float> B = Matrix<float>::RandomGaussian(2048,128,1,4); //DENSE
Matrix<float> C = Matrix<float>::RandomGaussian(4096,128,1,2); //DENSE
Matrix<float> C1(C); //DENSE
float alpha = 0.3, beta = 2;
bool transposeA=false, transposeB=false;
Matrix<float>::MultiplyAndWeightedAdd(alpha,Ad,transposeA,B,transposeB,beta,C); // DENSE*DENSE
Matrix<float>::MultiplyAndWeightedAdd(alpha,As,transposeA,B,transposeB,beta,C1);// SPARSE*DENSE
Assert::IsTrue(C1.IsEqualTo(C,0.00001));
}
TEST_METHOD(MatrixDenseTimesSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);
Matrix<float> B = Matrix<float>::RandomGaussian(2048,1024,1,4);
Matrix<float> C = Matrix<float>::RandomGaussian(2048,2048,1,2);
Matrix<float> C1(C);
float alpha = 0.3, beta = 0;
bool transposeA=false, transposeB=false;
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C);
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1);
Assert::IsTrue(C1.IsEqualTo(C,0.0001));
alpha = 3.3, beta = 1.3;
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C);
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1);
Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision
}
TEST_METHOD(MatrixSparseTimesSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(2048,1024,-5,0.4,0),0);
Matrix<float> Bs(Bd);
Matrix<float> Cd;
Cd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,1024,-4,0.2,0),0);
Matrix<float> Cs(Cd);
float alpha = 2.4, beta=0;
bool transposeA = false, transposeB=false;
Matrix<float>::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Cs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs);
Cs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001));
alpha = 2.4, beta=3.4;
Cs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Cs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs);
Cs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001));
}
TEST_METHOD(MatrixSparsePlusSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bs(Bd);
float alpha = 1.0*rand() / RAND_MAX;
Matrix<float>::ScaleAndAdd(alpha,Ad,Bd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::ScaleAndAdd(alpha,As,Bs);
Bs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001));
}
TEST_METHOD(MatrixDensePlusSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bs(Bd);
float alpha = 1.0*rand() / RAND_MAX;
Matrix<float>::ScaleAndAdd(alpha,Ad,Bd);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::ScaleAndAdd(alpha,Ad,Bs);
Bs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001));
}
TEST_METHOD(MatrixSparsePlusDense)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bd1(Bd);
float alpha = 1.0*rand() / RAND_MAX;
Matrix<float>::ScaleAndAdd(alpha,Ad,Bd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::ScaleAndAdd(alpha,As,Bd1);
Assert::IsTrue(Bd1.IsEqualTo(Bd,0.00001));
}
TEST_METHOD(MatrixSparseElementWisePower)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
As.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bs(Bd);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Ad^=2.3; //DENSE
As^=2.3; //SPARSE
Assert::IsTrue(As.IsEqualTo(Ad,0.00001));
Assert::IsTrue(Ad.IsEqualTo(As,0.00001));
Bd.AssignElementPowerOf(Ad,3.2);
Bs.AssignElementPowerOf(As,3.2);
#ifdef CHECK
Bs.SwitchToMatrixType(DENSE);
Bd.TransferFromDeviceToDevice(0,CPUDEVICE);
Bs.TransferFromDeviceToDevice(0,CPUDEVICE);
for (int r = 0; r < Bd.GetNumRows(); ++r)
for (int c = 0; c < Bd.GetNumCols(); ++c)
{
float dVal = Bd(r,c);
float sVal = Bs(r,c);
float diff = sVal - dVal;
if (fabs(diff) > 0.001)
cout << "[" << r << ", " << c << "]: " << sVal << " and " << dVal;
}
#endif
Assert::IsTrue(Bs.IsEqualTo(Bd,0.0001));
Assert::IsTrue(Bd.IsEqualTo(Bs,0.0001));
}
};
//
// <copyright file="MatrixUnitTests.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#include "stdafx.h"
#include "CppUnitTest.h"
#include "..\Math\Matrix.h"
#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project
#define epsilon 0.000001
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
using namespace Microsoft::MSR::CNTK;
using namespace Microsoft::VisualStudio::CppUnitTestFramework;
namespace CNTKMathTest
{
// Unit tests for Matrix<ElemType> sparse/dense interoperability: mode switching,
// sparse*dense / dense*sparse / sparse*sparse products, mixed-mode additions, and
// element-wise power. The common pattern is to compute the same operation on a
// dense copy and a sparse copy of identical data and assert (near-)equality
// within a tolerance. AssignTruncateBottomOf(RandomUniform(..., -3, 0.1, ...), 0)
// produces a mostly-zero matrix, i.e. a natural sparse test input.
TEST_CLASS(MatrixUnitTest)
{
public:
//This test should fail if you don't have CUDA GPU (or working under remote desktop)
// Round-trips a matrix DENSE -> SPARSE -> DENSE and checks that the zero-norm
// (count of non-zero elements) survives the conversion.
TEST_METHOD(MatrixChangeModesBetweenDenseAndSparseTests_Simple)
{
Matrix<float> A;
A.AssignTruncateBottomOf(Matrix<float>::RandomUniform(4096,2048,-3,0.1,0),0);
long n0 = A.MatrixNorm0();
Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
A.SwitchToMatrixType(MatrixType::SPARSE);
Assert::IsTrue(MatrixType::SPARSE==A.GetMatrixType());
long n1 = A.MatrixNorm0();
Assert::AreEqual<long>(n0,n1);
A.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
}
// C = alpha*A*B + beta*C must give the same result whether A is dense or sparse.
TEST_METHOD(MatrixSparseTimesDense)
{
Matrix<float> Ad; //DENSE
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(4096,2048,-3,0.1,0),0);//DENSE
Matrix<float> As(Ad);//DENSE
As.SwitchToMatrixType(MatrixType::SPARSE); //!!! MATRIX As becomes sparse
Matrix<float> B = Matrix<float>::RandomGaussian(2048,128,1,4); //DENSE
Matrix<float> C = Matrix<float>::RandomGaussian(4096,128,1,2); //DENSE
Matrix<float> C1(C); //DENSE
float alpha = 0.3, beta = 2;
bool transposeA=false, transposeB=false;
Matrix<float>::MultiplyAndWeightedAdd(alpha,Ad,transposeA,B,transposeB,beta,C); // DENSE*DENSE
Matrix<float>::MultiplyAndWeightedAdd(alpha,As,transposeA,B,transposeB,beta,C1);// SPARSE*DENSE
Assert::IsTrue(C1.IsEqualTo(C,0.00001));
}
// Dense B times (dense vs CSC-sparse) A, with beta = 0 and then beta != 0.
TEST_METHOD(MatrixDenseTimesSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);
Matrix<float> B = Matrix<float>::RandomGaussian(2048,1024,1,4);
Matrix<float> C = Matrix<float>::RandomGaussian(2048,2048,1,2);
Matrix<float> C1(C);
float alpha = 0.3, beta = 0;
bool transposeA=false, transposeB=false;
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C);
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1);
Assert::IsTrue(C1.IsEqualTo(C,0.0001));
alpha = 3.3, beta = 1.3;
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C);
Matrix<float>::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1);
Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision
}
// Same dense-times-sparse check as above, but explicitly on the CPU device.
// The second comparison is disabled because CPU-side IsEqualTo is not implemented yet.
TEST_METHOD(CPUMatrixDenseTimesSparse)
{
Matrix<float> Ad(CPUDEVICE);
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024, 2048, -3, 0.1, 0), 0);
Matrix<float> As(Ad);
As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);
Matrix<float> B = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> C = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> C1(C);
float alpha = 0.3, beta = 0;
bool transposeA = false, transposeB = false;
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1);
Assert::IsTrue(C1.IsEqualTo(C, 0.0001));
alpha = 3.3, beta = 1.3;
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1);
// TODO IsEqualTo NYI
// Assert::IsTrue(C1.IsEqualTo(C, 0.00005));
}
// CPU dense * (sparse, transposed) accumulated into two different sparse output
// formats (block-column vs CSC). Currently a smoke test only: the equality
// assert is disabled until IsEqualTo is implemented for these formats.
TEST_METHOD(CPUMatrixDenseTimesSparseAsSparse)
{
Matrix<float> Ad(CPUDEVICE);
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(2048, 1024, -3, 0.1, 0), 0);
Matrix<float> As(Ad);
As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);
Matrix<float> B = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> AsCsc = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> AsBlock(CPUDEVICE);
AsBlock.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol);
float alpha = 0.3, beta = 0;
bool transposeA = false, transposeB = true;
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, AsBlock);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, AsCsc);
// TODO IsEqualTo NYI
// Assert::IsTrue(AsBlock.IsEqualTo(AsCsc, 0.0001));
}
// Sparse * sparse product compared against the dense * dense reference,
// for beta = 0 and for an accumulating beta != 0.
TEST_METHOD(MatrixSparseTimesSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(2048,1024,-5,0.4,0),0);
Matrix<float> Bs(Bd);
Matrix<float> Cd;
Cd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,1024,-4,0.2,0),0);
Matrix<float> Cs(Cd);
float alpha = 2.4, beta=0;
bool transposeA = false, transposeB=false;
Matrix<float>::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Cs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs);
Cs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001));
alpha = 2.4, beta=3.4;
Cs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Cs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs);
Cs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001));
}
// B += alpha*A with both operands sparse, checked against the dense reference.
TEST_METHOD(MatrixSparsePlusSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bs(Bd);
float alpha = 1.0*rand() / RAND_MAX;
Matrix<float>::ScaleAndAdd(alpha,Ad,Bd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::ScaleAndAdd(alpha,As,Bs);
Bs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001));
}
// Sparse B += alpha * dense A, checked against the all-dense reference.
TEST_METHOD(MatrixDensePlusSparse)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bs(Bd);
float alpha = 1.0*rand() / RAND_MAX;
Matrix<float>::ScaleAndAdd(alpha,Ad,Bd);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::ScaleAndAdd(alpha,Ad,Bs);
Bs.SwitchToMatrixType(MatrixType::DENSE);
Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001));
}
// Dense B += alpha * sparse A, checked against the all-dense reference.
TEST_METHOD(MatrixSparsePlusDense)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bd1(Bd);
float alpha = 1.0*rand() / RAND_MAX;
Matrix<float>::ScaleAndAdd(alpha,Ad,Bd);
As.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float>::ScaleAndAdd(alpha,As,Bd1);
Assert::IsTrue(Bd1.IsEqualTo(Bd,0.00001));
}
// Element-wise power (operator^= and AssignElementPowerOf) on sparse vs dense.
// The #ifdef CHECK section is an optional manual diagnostic that dumps the
// element-wise differences on the CPU; it is compiled out by default.
TEST_METHOD(MatrixSparseElementWisePower)
{
Matrix<float> Ad;
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-3,0.1,0),0);
Matrix<float> As(Ad);
As.SwitchToMatrixType(MatrixType::SPARSE);
Matrix<float> Bd;
Bd.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024,2048,-5,0.4,0),0);
Matrix<float> Bs(Bd);
Bs.SwitchToMatrixType(MatrixType::SPARSE);
Ad^=2.3; //DENSE
As^=2.3; //SPARSE
Assert::IsTrue(As.IsEqualTo(Ad,0.00001));
Assert::IsTrue(Ad.IsEqualTo(As,0.00001));
Bd.AssignElementPowerOf(Ad,3.2);
Bs.AssignElementPowerOf(As,3.2);
#ifdef CHECK
Bs.SwitchToMatrixType(DENSE);
Bd.TransferFromDeviceToDevice(0,CPUDEVICE);
Bs.TransferFromDeviceToDevice(0,CPUDEVICE);
for (int r = 0; r < Bd.GetNumRows(); ++r)
for (int c = 0; c < Bd.GetNumCols(); ++c)
{
float dVal = Bd(r,c);
float sVal = Bs(r,c);
float diff = sVal - dVal;
if (fabs(diff) > 0.001)
cout << "[" << r << ", " << c << "]: " << sVal << " and " << dVal;
}
#endif
Assert::IsTrue(Bs.IsEqualTo(Bd,0.0001));
Assert::IsTrue(Bd.IsEqualTo(Bs,0.0001));
}
};
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -33,7 +33,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
CPUSparseMatrix(const MatrixFormat format);
CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
~CPUSparseMatrix();
public:
@ -76,6 +77,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
/// sum(vec(a).*vec(b))
static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/) { NOT_IMPLEMENTED; }
@ -89,6 +92,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
void Reset();
inline ElemType defaultElem()
{
ElemType default;
memset(&default, 0, sizeof(ElemType));
return default;
}
const ElemType& operator() (const size_t row, const size_t col) const
{
if (col >= m_numCols || row >= m_numRows)
{
throw std::runtime_error("Position outside matrix dimensions");
}
if (m_format == MatrixFormat::matrixFormatSparseCSC)
{
size_t start = m_compIndex[col];
size_t end = m_compIndex[col + 1];
for (size_t p = start; p < end; p++)
{
size_t i = m_unCompIndex[p];
if (i == row)
{
return m_pArray[p];
}
}
return m_default;
}
else
{
NOT_IMPLEMENTED;
}
}
public:
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum);
void Adagrad(CPUMatrix<ElemType>& c);
@ -103,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
const ElemType* NzValues() const { return m_pArray; }
ElemType* NzValues() { return m_pArray; }
inline ElemType* NzValues() { return m_pArray; }
size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
@ -139,9 +177,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSPARSE_INDEX_TYPE *m_unCompIndex; //row/col ids in CSC/CSR format
CPUSPARSE_INDEX_TYPE *m_compIndex; //begin ids of col/row in CSC/CSR format
size_t m_blockSize; //block size
ElemType *m_blockVal; //block values
size_t m_blockSize; //block size
size_t *m_blockIds; //block ids
ElemType m_default;
};
typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,266 +1,265 @@
//
// <copyright file="GPUSparseMatrix.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "GPUMatrix.h"
#include "CPUSparseMatrix.h"
#include <functional>
namespace Microsoft { namespace MSR { namespace CNTK {
//GPU Sparse Matrix, using cuSPARSE library.
//By default we are assuming CSR representation
// NOTE m_elemSizeAllocated (in base matrix) means the number of non-zero elements we have allocated space
// We are packing the CSR format (pointed to by m_pArray) as follows:
// ElemType elements[m_elemSizeAllocated]
// int colIdx[m_elemSizeAllocated]
// int rowIdxStart[m_numRows+1]
template<class ElemType>
class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
{
typedef BaseMatrix<ElemType> B; using B::m_numRows; using B::m_numCols; using B::m_pArray; using B::m_elemSizeAllocated; using B::m_nz; using B::m_format; // without this, base members would require to use thi-> in GCC
public:
GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR, const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);
GPUSparseMatrix(const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR,
const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);
GPUSparseMatrix(const GPUSparseMatrix<ElemType>&);
GPUSparseMatrix(const GPUMatrix<ElemType>&, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
#ifndef LINUX
GPUSparseMatrix(GPUSparseMatrix<ElemType>&&);
#endif /* LINUX */
~GPUSparseMatrix();
public:
void Reset();
public:
// return col pointer, which is immediately following the non-zero element
// in memory format is always in the following order:
// Non-zero data elements, Full index locations, compressed index locations
// In CSR row data is compressed, in CSC col data is compressed
const ElemType* NzValues() const {return m_pArray;}
ElemType* NzValues() {return m_pArray;}
size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
size_t MajorIndexCount() const { return m_nz; }
size_t MajorIndexSize() const { return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount(); } // actual number of major index bytes in use
GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const { return MajorIndexLocation() + m_elemSizeAllocated; } //this is the compressed index, col/row in CSC/CSR format
size_t SecondaryIndexCount(const size_t numNZ) const
{
if (m_format&matrixFormatCompressed)
{
size_t cnt = (m_format&matrixFormatRowMajor)?m_numRows:m_numCols;
if (cnt > 0) cnt++; // add an extra element on the end for the "max" value
return cnt;
}
else
return numNZ; // COO format
}
size_t SecondaryIndexCount() const
{
return SecondaryIndexCount(m_nz);
}
// get size for compressed index
size_t SecondaryIndexSize() const { return (SecondaryIndexCount())*sizeof(GPUSPARSE_INDEX_TYPE); }
size_t BufferSizeNeeded() const { return NzSize() + MajorIndexSize() + SecondaryIndexSize(); }
size_t BufferSizeNeeded(const size_t numNZ) const
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
ElemType* BufferPointer() const;
// the column and row locations will swap based on what format we are in. Full index always follows the data array
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
size_t RowSize() const {return (m_format&matrixFormatRowMajor)?SecondaryIndexSize():MajorIndexSize();}
GPUSPARSE_INDEX_TYPE* ColLocation() const { return (m_format&matrixFormatRowMajor) ? MajorIndexLocation() : SecondaryIndexLocation(); }
size_t ColSize() const {return (m_format&matrixFormatRowMajor)?MajorIndexSize():SecondaryIndexSize();} // actual number of bytes in use
void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& denseMatrix, const MatrixFormat matrixFormat);
void SetValue(const GPUMatrix<ElemType>& denseMatrix);
void ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly = true);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly = true);
GPUSparseMatrix<ElemType> Transpose() const;
void InplaceTranspose();
GPUSparseMatrix<ElemType>& AssignTransposeOf(const GPUSparseMatrix<ElemType>& a);
GPUMatrix<ElemType> CopyToDenseMatrix() const;
void CopyToDenseMatrix(GPUMatrix<ElemType> &denseMatrix) const;
void CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType> &cpuSparseMatrix) const;
void ChangeDeviceTo(DEVICEID_TYPE toId);
GPUSparseMatrix<ElemType>& operator=(const GPUSparseMatrix<ElemType>& deepCopy);
#ifndef LINUX
GPUSparseMatrix<ElemType>& operator=(GPUSparseMatrix<ElemType>&& moveFrom);
#endif /* LINUX */
GPUSparseMatrix<ElemType> operator+ (const GPUSparseMatrix<ElemType>& a) const;
GPUSparseMatrix<ElemType> operator- (const GPUSparseMatrix<ElemType>& a) const;
GPUSparseMatrix<ElemType>& operator^= (const ElemType alpha); //element-wise power
GPUSparseMatrix<ElemType> operator^ (const ElemType alpha) const; //element-wise power
GPUSparseMatrix<ElemType>& operator*= (const ElemType alpha);
GPUSparseMatrix<ElemType> operator*(const ElemType alpha) const;
GPUSparseMatrix<ElemType>& AssignElementPowerOf(const GPUSparseMatrix<ElemType>& a, const ElemType power);
bool IsEqualTo(const GPUSparseMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
public:
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
size_t GetNumNZElements() const {return m_nz;}
//Sets sparse matrix in CSR format. this acts as deep copy
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
void SetMatrixFromCSCFormat(const GPUSPARSE_INDEX_TYPE *h_CSCCol, const GPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
void SetMatrixFromLabelAndClass(CPUSPARSE_INDEX_TYPE *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t labelSize, size_t expandedSize, size_t blockSize);
//Gets sparse matrix in CSR format. this acts as deep copy. All passed pointers must be NULL. the function will allocate memory itself.
void GetMatrixFromCSRFormat(GPUSPARSE_INDEX_TYPE*& h_CSRRow, GPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
// Gets sparse matrix in CSC format as a deep copy; the function allocates the output arrays itself.
void GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void ConvertToSparseFormat(MatrixFormat newFormat);
void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;
public:
// ---- element-wise math ----
// By naming convention, Inplace*/operator forms modify *this, while Assign*Of
// forms overwrite *this with f(a) — TODO confirm against the .cu implementations.
GPUSparseMatrix<ElemType>& ElementInverse ();
GPUSparseMatrix<ElemType>& AssignElementInverseOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceLinearRectifierDerivative();
GPUSparseMatrix<ElemType>& AssignLinearRectifierDerivativeOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceSigmoid ();
GPUSparseMatrix<ElemType>& AssignSigmoidOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceTanh ();
GPUSparseMatrix<ElemType>& AssignTanhOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceSqrt ();
GPUSparseMatrix<ElemType>& AssignSqrtOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceExp ();
GPUSparseMatrix<ElemType>& AssignExpOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceLog ();
GPUSparseMatrix<ElemType>& AssignLogOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceAbs ();
GPUSparseMatrix<ElemType>& AssignAbsOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
GPUSparseMatrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignTruncateBottomOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
GPUSparseMatrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignTruncateTopOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
GPUSparseMatrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
// ---- reductions / norms ----
ElemType SumOfElements () const; //sum of all elements
ElemType SumOfAbsElements () const; //sum of all abs(elements)
ElemType FrobeniusNorm() const;
ElemType MatrixNormInf() const;
ElemType MatrixNorm1() const;
// L0 "norm": number of stored non-zero elements (cast to ElemType).
ElemType MatrixNorm0() const { return (ElemType)GetNumNZElements(); };
public:
// ---- static BLAS-style operations mixing sparse and dense operands ----
//Performs C = alpha op ( S ) D + beta C; Where S is sparse and D and C are dense
static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b,
const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c);
static void MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& S, const bool transposeS, const GPUMatrix<ElemType>& D,
const bool transposeD, ElemType beta, GPUMatrix<ElemType>& C);
static void MultiplyAndAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs,
const bool transposeB, GPUSparseMatrix<ElemType>& c);
static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& c);
// Class-based cross-entropy support (class-factored softmax for language models — TODO confirm).
static void ClassEntropy(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& weight,
const GPUSparseMatrix<ElemType> & label, const GPUMatrix<ElemType>& cls,
const GPUMatrix<ElemType>& idx2cls, GPUSparseMatrix<ElemType>& etp, GPUMatrix<ElemType>& entropyScore);
static void ClassEntropyError(GPUSparseMatrix<ElemType>& a);
static void ClassEntropyGradientOfInput(const GPUSparseMatrix<ElemType>& error, const GPUMatrix<ElemType>& weight, GPUMatrix<ElemType>& grd);
static void ClassEntropyGradientOfWeight(const GPUSparseMatrix<ElemType>& error, const GPUMatrix<ElemType>& input, const GPUSparseMatrix<ElemType> & label, const GPUMatrix<ElemType>& cls,
const GPUMatrix<ElemType>& idx2cls, GPUSparseMatrix<ElemType>& grd);
void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);
static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);
static void Multiply(const GPUSparseMatrix<ElemType>& S1, bool transposeS1, const GPUSparseMatrix<ElemType>& S2, bool transposeS2, GPUSparseMatrix<ElemType> &C);
GPUSparseMatrix<ElemType>& AssignProductOf(const GPUSparseMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b, const bool transposeB);
static ElemType InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
static ElemType InnerProductOfMatrices(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b);
static void ScaleAndAdd(ElemType alpha,const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c);
static void ScaleAndAdd(ElemType alpha,const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void ScaleAndAdd(ElemType alpha,const GPUMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void Scale(ElemType alpha, GPUSparseMatrix<ElemType>& a);
static void ElementWisePower (ElemType alpha, const GPUSparseMatrix<ElemType>& a, GPUSparseMatrix<ElemType>& c);
static bool AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
static bool AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);
static bool AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
//For these two, I should also add a version which would return GPUSparseMatrix, since Dense.*Sparse =Sparse.*Dense=Sparse
static GPUMatrix<ElemType> ElementProductOf (const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
static GPUMatrix<ElemType> ElementProductOf (const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b);
public:
// ---- serialization ----
// See: http://stackoverflow.com/questions/4660123/overloading-friend-operator-for-template-class/4661372#4661372
template <class ElemTypeDummy>
friend MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemTypeDummy>& us);
template <class ElemTypeDummy>
friend MATH_API File& operator<<(File& stream, const GPUSparseMatrix<ElemTypeDummy>& us);
private:
// ---- internal helpers ----
void* ReserveTempHostBuffer(const size_t sizeInByte) const;
template <class OutType, class InType>
static void CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size);
private:
void ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE deviceId);
private:
void performInplaceFunction(const int kind);
void DeepCopy(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void Clear();
void PrepareBuffer(const size_t numRows, const size_t numCols, const bool canReuseBuffer, std::function<size_t(GPUSPARSE_INDEX_TYPE* csrRowPtrC)> func);
size_t ElemCountFromBufferSize(const size_t totalBufferSize) const;
size_t ElemCountFromBufferSize() const;
DEVICEID_TYPE PrepareDevice(const DEVICEID_TYPE deviceId = -1) const;
private:
// ---- data members ----
size_t m_totalBufferSizeAllocated;
size_t m_blockSize; //block size
ElemType *m_blockVal; //block values
size_t *m_blockIds; //block ids
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
size_t m_expandedSize; // expanded label size
size_t* m_block2Id; // label block id to first word location
size_t* m_block2UniqId; // label block id to unique first word location
mutable void* m_tempHostBuffer; //used to copy values.
mutable size_t m_tempHostBufferSize;
// When true, kernels are followed by a device sync (shared by all instances).
static bool do_sync;
};
}}}
//
// <copyright file="GPUSparseMatrix.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "GPUMatrix.h"
#include "CPUSparseMatrix.h"
#include <functional>
namespace Microsoft { namespace MSR { namespace CNTK {
//GPU Sparse Matrix, using cuSPARSE library.
//By default we are assuming CSR representation
// NOTE m_elemSizeAllocated (in base matrix) means the number of non-zero elements we have allocated space
// We are packing the CSR format (pointed to by m_pArray) as follows:
// ElemType elements[m_elemSizeAllocated]
// int colIdx[m_elemSizeAllocated]
// int rowIdxStart[m_numRows+1]
template<class ElemType>
class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
{
typedef BaseMatrix<ElemType> B; using B::m_numRows; using B::m_numCols; using B::m_pArray; using B::m_elemSizeAllocated; using B::m_nz; using B::m_format; // without this, base members would require use of this-> in GCC (dependent-base name lookup)
public:
// ---- construction / destruction ----
GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR, const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);
GPUSparseMatrix(const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR,
const DEVICEID_TYPE computeDevice = AUTOPLACEMATRIX);
GPUSparseMatrix(const GPUSparseMatrix<ElemType>&);
GPUSparseMatrix(const GPUMatrix<ElemType>&, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR); // dense -> sparse conversion
#ifndef LINUX
GPUSparseMatrix(GPUSparseMatrix<ElemType>&&); // move ctor; compiled out when LINUX is defined
#endif /* LINUX */
~GPUSparseMatrix();
public:
void Reset();
public:
// ---- raw buffer layout accessors ----
// return col pointer, which is immediately following the non-zero element
// in memory format is always in the following order:
// Non-zero data elements, Full index locations, compressed index locations
// In CSR row data is compressed, in CSC col data is compressed
inline const ElemType* NzValues() const {return m_pArray;}
inline ElemType* NzValues() {return m_pArray;}
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
size_t MajorIndexCount() const { return m_nz; }
size_t MajorIndexSize() const { return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount(); } // actual number of major index bytes in use
GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const { return MajorIndexLocation() + m_elemSizeAllocated; } //this is the compressed index, col/row in CSC/CSR format
// Number of entries in the compressed index for a matrix holding numNZ non-zeros:
// rows+1 (CSR) / cols+1 (CSC) for compressed formats, numNZ for COO.
size_t SecondaryIndexCount(const size_t numNZ) const
{
if (m_format&matrixFormatCompressed)
{
size_t cnt = (m_format&matrixFormatRowMajor)?m_numRows:m_numCols;
if (cnt > 0) cnt++; // add an extra element on the end for the "max" value
return cnt;
}
else
return numNZ; // COO format
}
size_t SecondaryIndexCount() const
{
return SecondaryIndexCount(m_nz);
}
// get size for compressed index
size_t SecondaryIndexSize() const { return (SecondaryIndexCount())*sizeof(GPUSPARSE_INDEX_TYPE); }
// Total bytes needed for values + major index + secondary index.
size_t BufferSizeNeeded() const { return NzSize() + MajorIndexSize() + SecondaryIndexSize(); }
size_t BufferSizeNeeded(const size_t numNZ) const
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
inline ElemType* BufferPointer() const { return m_pArray; }
// the column and row locations will swap based on what format we are in. Full index always follows the data array
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
size_t RowSize() const {return (m_format&matrixFormatRowMajor)?SecondaryIndexSize():MajorIndexSize();}
GPUSPARSE_INDEX_TYPE* ColLocation() const { return (m_format&matrixFormatRowMajor) ? MajorIndexLocation() : SecondaryIndexLocation(); }
size_t ColSize() const {return (m_format&matrixFormatRowMajor)?MajorIndexSize():SecondaryIndexSize();} // actual number of bytes in use
// ---- assignment / conversion / device movement ----
void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& denseMatrix, const MatrixFormat matrixFormat);
void SetValue(const GPUMatrix<ElemType>& denseMatrix);
void ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly = true);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly = true);
GPUSparseMatrix<ElemType> Transpose() const;
void InplaceTranspose();
GPUSparseMatrix<ElemType>& AssignTransposeOf(const GPUSparseMatrix<ElemType>& a);
GPUMatrix<ElemType> CopyToDenseMatrix() const;
void CopyToDenseMatrix(GPUMatrix<ElemType> &denseMatrix) const;
void CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType> &cpuSparseMatrix) const;
void ChangeDeviceTo(DEVICEID_TYPE toId);
GPUSparseMatrix<ElemType>& operator=(const GPUSparseMatrix<ElemType>& deepCopy);
#ifndef LINUX
GPUSparseMatrix<ElemType>& operator=(GPUSparseMatrix<ElemType>&& moveFrom); // move assignment; compiled out when LINUX is defined
#endif /* LINUX */
// ---- arithmetic operators ----
GPUSparseMatrix<ElemType> operator+ (const GPUSparseMatrix<ElemType>& a) const;
GPUSparseMatrix<ElemType> operator- (const GPUSparseMatrix<ElemType>& a) const;
GPUSparseMatrix<ElemType>& operator^= (const ElemType alpha); //element-wise power
GPUSparseMatrix<ElemType> operator^ (const ElemType alpha) const; //element-wise power
GPUSparseMatrix<ElemType>& operator*= (const ElemType alpha);
GPUSparseMatrix<ElemType> operator*(const ElemType alpha) const;
GPUSparseMatrix<ElemType>& AssignElementPowerOf(const GPUSparseMatrix<ElemType>& a, const ElemType power);
bool IsEqualTo(const GPUSparseMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
public:
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
inline size_t GetNumNZElements() const {return m_nz;}
//Sets sparse matrix in CSR format. This acts as a deep copy.
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
void SetMatrixFromCSCFormat(const GPUSPARSE_INDEX_TYPE *h_CSCCol, const GPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
void SetMatrixFromLabelAndClass(CPUSPARSE_INDEX_TYPE *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t labelSize, size_t expandedSize, size_t blockSize);
//Gets sparse matrix in CSR format. this acts as deep copy. All passed pointers must be NULL. the function will allocate memory itself.
void GetMatrixFromCSRFormat(GPUSPARSE_INDEX_TYPE*& h_CSRRow, GPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void ConvertToSparseFormat(MatrixFormat newFormat);
void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;
public:
// ---- element-wise math ----
// By naming convention, Inplace* forms modify *this, while Assign*Of forms
// overwrite *this with f(a) — TODO confirm against the .cu implementations.
GPUSparseMatrix<ElemType>& ElementInverse ();
GPUSparseMatrix<ElemType>& AssignElementInverseOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceLinearRectifierDerivative();
GPUSparseMatrix<ElemType>& AssignLinearRectifierDerivativeOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceSigmoid ();
GPUSparseMatrix<ElemType>& AssignSigmoidOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceTanh ();
GPUSparseMatrix<ElemType>& AssignTanhOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceSqrt ();
GPUSparseMatrix<ElemType>& AssignSqrtOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceExp ();
GPUSparseMatrix<ElemType>& AssignExpOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceLog ();
GPUSparseMatrix<ElemType>& AssignLogOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceAbs ();
GPUSparseMatrix<ElemType>& AssignAbsOf (const GPUSparseMatrix<ElemType>& a);
GPUSparseMatrix<ElemType>& InplaceTruncate (const ElemType threshold);
GPUSparseMatrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignTruncateBottomOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
GPUSparseMatrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignTruncateTopOf (const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
GPUSparseMatrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
// ---- reductions / norms ----
ElemType SumOfElements () const; //sum of all elements
ElemType SumOfAbsElements () const; //sum of all abs(elements)
ElemType FrobeniusNorm() const;
ElemType MatrixNormInf() const;
ElemType MatrixNorm1() const;
// L0 "norm": number of stored non-zero elements (cast to ElemType).
ElemType MatrixNorm0() const { return (ElemType)GetNumNZElements(); };
public:
// ---- static BLAS-style operations mixing sparse and dense operands ----
//Performs C = alpha op ( S ) D + beta C; Where S is sparse and D and C are dense
static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b,
const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c);
static void MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& S, const bool transposeS, const GPUMatrix<ElemType>& D,
const bool transposeD, ElemType beta, GPUMatrix<ElemType>& C);
static void MultiplyAndAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs,
const bool transposeB, GPUSparseMatrix<ElemType>& c);
static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& c);
// Class-based cross-entropy support (class-factored softmax for language models — TODO confirm).
static void ClassEntropy(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& weight,
const GPUSparseMatrix<ElemType> & label, const GPUMatrix<ElemType>& cls,
const GPUMatrix<ElemType>& idx2cls, GPUSparseMatrix<ElemType>& etp, GPUMatrix<ElemType>& entropyScore);
static void ClassEntropyError(GPUSparseMatrix<ElemType>& a);
static void ClassEntropyGradientOfInput(const GPUSparseMatrix<ElemType>& error, const GPUMatrix<ElemType>& weight, GPUMatrix<ElemType>& grd);
static void ClassEntropyGradientOfWeight(const GPUSparseMatrix<ElemType>& error, const GPUMatrix<ElemType>& input, const GPUSparseMatrix<ElemType> & label, const GPUMatrix<ElemType>& cls,
const GPUMatrix<ElemType>& idx2cls, GPUSparseMatrix<ElemType>& grd);
void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);
static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);
static void Multiply(const GPUSparseMatrix<ElemType>& S1, bool transposeS1, const GPUSparseMatrix<ElemType>& S2, bool transposeS2, GPUSparseMatrix<ElemType> &C);
GPUSparseMatrix<ElemType>& AssignProductOf(const GPUSparseMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b, const bool transposeB);
static ElemType InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
static ElemType InnerProductOfMatrices(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b);
static void ScaleAndAdd(ElemType alpha,const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c);
static void ScaleAndAdd(ElemType alpha,const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void ScaleAndAdd(ElemType alpha,const GPUMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void Scale(ElemType alpha, GPUSparseMatrix<ElemType>& a);
static void ElementWisePower (ElemType alpha, const GPUSparseMatrix<ElemType>& a, GPUSparseMatrix<ElemType>& c);
static bool AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
static bool AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);
static bool AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
//For these two, I should also add a version which would return GPUSparseMatrix, since Dense.*Sparse =Sparse.*Dense=Sparse
static GPUMatrix<ElemType> ElementProductOf (const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b);
static GPUMatrix<ElemType> ElementProductOf (const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b);
public:
// ---- serialization ----
// See: http://stackoverflow.com/questions/4660123/overloading-friend-operator-for-template-class/4661372#4661372
template <class ElemTypeDummy>
friend MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemTypeDummy>& us);
template <class ElemTypeDummy>
friend MATH_API File& operator<<(File& stream, const GPUSparseMatrix<ElemTypeDummy>& us);
private:
// ---- internal helpers ----
void* ReserveTempHostBuffer(const size_t sizeInByte) const;
template <class OutType, class InType>
static void CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size);
private:
void ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE deviceId);
private:
void performInplaceFunction(const int kind);
void DeepCopy(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void Clear();
void PrepareBuffer(const size_t numRows, const size_t numCols, const bool canReuseBuffer, std::function<size_t(GPUSPARSE_INDEX_TYPE* csrRowPtrC)> func);
size_t ElemCountFromBufferSize(const size_t totalBufferSize) const;
size_t ElemCountFromBufferSize() const;
DEVICEID_TYPE PrepareDevice(const DEVICEID_TYPE deviceId = -1) const;
private:
// ---- data members ----
size_t m_totalBufferSizeAllocated;
size_t m_blockSize; //block size
size_t *m_blockIds; //block ids
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
size_t m_expandedSize; // expanded label size
size_t* m_block2Id; // label block id to first word location
size_t* m_block2UniqId; // label block id to unique first word location
mutable void* m_tempHostBuffer; //used to copy values.
mutable size_t m_tempHostBufferSize;
// When true, kernels are followed by a device sync (shared by all instances).
static bool do_sync;
};
}}}

Просмотреть файл

@ -733,6 +733,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_CPUSparseMatrix == nullptr)
{
m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(newMatrixFormat);
if (GetMatrixType() == MatrixType::DENSE && m_CPUMatrix != nullptr)
{
m_CPUSparseMatrix->Resize(GetNumRows(), GetNumCols());
CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
}
else
{
// TODO: Assign Sparse from Sparse!
}
delete m_CPUMatrix;
m_CPUMatrix = nullptr;
}
@ -801,6 +812,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// Copies every element of a dense CPU matrix into a CPU sparse matrix, one
// SetValue call per coordinate. foreach_coord is a project macro that iterates
// all (row, col) pairs of 'from'.
// NOTE(review): zero elements are passed to SetValue as well — presumably
// CPUSparseMatrix::SetValue drops them; confirm, since storing explicit zeros
// would defeat the purpose of the sparse representation.
template<class ElemType>
void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest)
{
foreach_coord(row, col, from)
{
auto val = from(row, col);
dest.SetValue(row, col, val);
}
}
template<class ElemType>
ElemType Matrix<ElemType>::Get00Element() const
@ -3992,7 +4012,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nullptr,
return CPUMatrix<ElemType>::AreEqual(*a.m_CPUMatrix, *b.m_CPUMatrix, threshold),
return GPUMatrix<ElemType>::AreEqual(*a.m_GPUMatrix, *b.m_GPUMatrix, threshold),
NOT_IMPLEMENTED; return false ,
return CPUSparseMatrix<ElemType>::AreEqual(*a.m_CPUSparseMatrix, *b.m_CPUSparseMatrix, threshold),
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUSparseMatrix, *b.m_GPUSparseMatrix, threshold)
);
}

Просмотреть файл

@ -1,439 +1,440 @@
//
// <copyright file="Matrix.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "CPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {
// Which backing store currently holds the data of a Matrix.
// BOTH presumably means the CPU and GPU copies are both up to date, and NONE
// that nothing has been materialized yet — confirm against SetDataLocation usage.
enum CurrentDataLocation
{
NONE, CPU, GPU, BOTH
};
// Storage class of a Matrix: dense wrapper, sparse wrapper, or not yet decided
// (UNDETERMINED until the first operation fixes the representation).
enum MatrixType
{
UNDETERMINED, DENSE, SPARSE
};
//To comply with BLAS libraries, matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
//conversion is needed when passing data between Matrix and C++ matrices
//For the best performance compile CNTKMath project with NO_SYNC preprocessor directive
//!!!WARNING!!! This class is NOT THREAD SAFE. Test and add necessary modifications if using in multi-threaded environment
template<class ElemType>
class MATH_API Matrix
{
private:
mutable BaseMatrix<ElemType> *m_baseMatrix;
mutable GPUMatrix<ElemType> *m_GPUMatrix;
mutable CPUMatrix<ElemType> *m_CPUMatrix;
mutable GPUSparseMatrix<ElemType> *m_GPUSparseMatrix;
mutable CPUSparseMatrix<ElemType> *m_CPUSparseMatrix;
mutable MatrixType m_matrixType;
mutable CurrentDataLocation m_currentDataLocation; //Indicates which matrix is current
mutable DEVICEID_TYPE m_preferredDeviceId;
//Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
void _transferFromDeviceToDevice(int id_from, int id_to, bool ismoved=true,bool emptyTransfer=false) const;
//Moves matrix from current device to device with id_to. This method doesn't change preferred device Id
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
public:
//Constructors, destructors and other static matrix builders
//Each constructor can take deviceId as parameter.
//If deviceId<0 then the matrix will be based in RAM (CPUMatrix)
//Elseif deviceId>=0 and <AUTOPLACEMATRIX, then the matrix will be based on GPU with specified deviceId
//Else (default) if deviceId=AUTOPLACEMATRIX, the class will try to place itself on the best GPU, if fails it will go to CPU
//The default behaviour should be deviceId=AUTOPLACEMATRIX
Matrix(DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
Matrix(BaseMatrix<ElemType>* baseMatrix, ElemType *pArray, DEVICEID_TYPE deviceId); // constructor for setting Matrix from a base matrix (externally managed butter pArray)
Matrix(FILE* f, const char * matrixName, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const MatrixType matrixType = DENSE); //matrixName is used to verify that correct matrix is read.
Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const MatrixType matrixType = DENSE);
Matrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags=matrixFlagNormal, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const size_t nnz=0);
Matrix(const Matrix<ElemType>& deepCopyFrom, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX); //copy constructor, deep copy
Matrix<ElemType>& operator=(const Matrix<ElemType>& deepCopyFrom); //assignment operator, deep copy
Matrix(Matrix<ElemType>&& moveFrom); //move constructor, shallow copy
Matrix<ElemType>& operator=(Matrix<ElemType>&& moveFrom); //move coment operator, shallow copy
static Matrix<ElemType> Ones(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
static Matrix<ElemType> Zeros(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
static Matrix<ElemType> Eye(const size_t rows, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
static Matrix<ElemType> RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed=USE_TIME_BASED_SEED, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
static Matrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
void Clear();
~Matrix();
private:
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
void Init(DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
void SetDataLocation(CurrentDataLocation location, MatrixType type=UNDETERMINED) const;
public:
MatrixType GetMatrixType() const {return m_matrixType;};
bool OwnBuffer() const {return m_baseMatrix->OwnBuffer();}
int GetDeviceId() const; //-1 if CPU, otherwise GPU CUDA device id
DEVICEID_TYPE GetPreferredDeviceId() const { return m_preferredDeviceId; }; //-1 if CPU, otherwise GPU CUDA device id
void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId){ if (m_preferredDeviceId != preferredDeviceId) m_preferredDeviceId = preferredDeviceId; }
//Moves matrix from device id_from to device with id_to.
//If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor
void TransferFromDeviceToDevice(int id_from, int id_to, bool ismoved=false, bool emptyTransfer=false, bool updatePreferredDevice=true) const;
CurrentDataLocation GetCurrentMatrixLocation() const { return m_currentDataLocation; };
void SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat newMatrixFormat = matrixFormatSparseCSR); //sets matrix type between dense and sparse
size_t GetNumRows() const;
size_t GetNumCols() const;
size_t GetNumElements() const;
wchar_t* GetMatrixName() const;
void SetMatrixName(const wchar_t* s);
bool IsEmpty() const;
size_t BufferSize() const;
ElemType* BufferPointer() const;
size_t NzCount() const;
ElemType* CopyToArray() const; //allocated by the callee but need to be deleted by the caller
size_t CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const; //allocated by the callee but need to be deleted by the caller
Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
Matrix<ElemType>& AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
void ShiftBy(int numShift) ;
void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
//--- in-place optimizer-state updates (this matrix holds the accumulator/smoothed state) ---
void Adagrad(Matrix<ElemType>& gradients);
void RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN);
//--- shape manipulation ---
void Reshape(const size_t numRows, const size_t numCols);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 0, bool growOnly = true); //by default we only reallocate if need to grow
size_t GetAllocatedSize() const;
void Reset(); //reset for sparse matrix
//--- element access; const overload returns by value, non-const by reference ---
const ElemType operator() (const size_t row, const size_t col) const;
ElemType& operator() (const size_t row, const size_t col);
ElemType Get00Element() const;
//--- value setters; overloads cover scalar fill, deep copy, external array, and sparse element set ---
void SetValue(const ElemType v);
void SetValue(const DeviceBoundNumber<ElemType>& db_number);
void SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR);
void SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags=matrixFlagNormal, int deviceId=MANAGEDEXTERN);
void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
void SetMatrixFromCSCFormat(const GPUSPARSE_INDEX_TYPE *h_CSCCol, const GPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
const size_t nz, const size_t numRows, const size_t numCols);
void SetMatrixFromLabelAndClass(CPUSPARSE_INDEX_TYPE *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t labelSize, size_t expandedSize, size_t blockSize);
void SetColumn(const ElemType* colPointer, size_t colInd);
void SetColumn(const ElemType val, size_t colInd);
void SetColumn(const Matrix<ElemType>& valMat, size_t colInd);
void SetDiagonalValue(const ElemType v);
void SetDiagonalValue(Matrix<ElemType>& vector);
//--- random initialization; seed defaults to time-based unless given explicitly ---
void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed=USE_TIME_BASED_SEED);
void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED);
void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed=USE_TIME_BASED_SEED);
void AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED);
//--- transpose; Assign*Of variants write the result of the op on `a` into *this and return *this ---
Matrix<ElemType> Transpose();
Matrix<ElemType>& AssignTransposeOf (const Matrix<ElemType>& a);
//--- arithmetic operators (scalar and matrix operands) ---
Matrix<ElemType>& operator+= (const ElemType alpha);
Matrix<ElemType> operator+ (const ElemType alpha) const;
Matrix<ElemType>& AssignSumOf(const ElemType alpha, const Matrix<ElemType>& a);
Matrix<ElemType>& operator+= (const Matrix<ElemType>& a);
Matrix<ElemType> operator+ (const Matrix<ElemType>& a) const;
Matrix<ElemType>& AssignSumOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& operator-= (const ElemType alpha);
Matrix<ElemType> operator- (const ElemType alpha) const;
Matrix<ElemType>& AssignDifferenceOf(const ElemType alpha, const Matrix<ElemType>& a);
Matrix<ElemType>& AssignDifferenceOf(const Matrix<ElemType>& a, const ElemType alpha);
Matrix<ElemType>& operator-= (const Matrix<ElemType>& a);
Matrix<ElemType> operator- (const Matrix<ElemType>& a) const;
Matrix<ElemType>& AssignDifferenceOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& operator*= (const ElemType alpha);
Matrix<ElemType> operator* (const ElemType alpha) const;
Matrix<ElemType>& AssignProductOf(const ElemType alpha, const Matrix<ElemType>& a);
Matrix<ElemType> operator* (const Matrix<ElemType>& a) const;
Matrix<ElemType>& AssignProductOf (const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB);
Matrix<ElemType>& operator/= (ElemType alpha);
Matrix<ElemType> operator/ (ElemType alpha) const;
Matrix<ElemType>& operator^= (ElemType alpha); //element-wise power
Matrix<ElemType> operator^ (ElemType alpha) const; //element-wise power
Matrix<ElemType>& AssignElementPowerOf(const Matrix<ElemType>& a, const ElemType power);
//--- element-wise (Hadamard) products, divisions, and row/column-broadcast variants ---
Matrix<ElemType>& ElementMultiplyWith (const Matrix<ElemType>& a);
Matrix<ElemType>& AssignElementProductOf (const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AddElementProductOf (const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AssignElementDivisionOf (const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& ElementDivideBy(const Matrix<ElemType>& a);
Matrix<ElemType>& ColumnElementMultiplyWith(const Matrix<ElemType>& a);
Matrix<ElemType>& RowElementMultiplyWith(const Matrix<ElemType>& a);
Matrix<ElemType>& ColumnElementDivideBy(const Matrix<ElemType>& a);
Matrix<ElemType>& RowElementDivideBy(const Matrix<ElemType>& a);
Matrix<ElemType>& ElementInverse ();
Matrix<ElemType>& AssignElementInverseOf (const Matrix<ElemType>& a);
//--- element-wise nonlinearities; Inplace* mutate *this, Assign*Of write f(a) into *this ---
Matrix<ElemType>& InplaceLinearRectifierDerivative();
Matrix<ElemType>& AssignLinearRectifierDerivativeOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceSigmoidDerivative();
Matrix<ElemType>& AssignSigmoidDerivativeOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceSigmoid ();
Matrix<ElemType>& AssignSigmoidOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceTanh ();
Matrix<ElemType>& AssignTanhOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLogSoftmax (const bool isColWise);
Matrix<ElemType>& AssignLogSoftmaxOf (const Matrix<ElemType>& a, const bool isColWise);
Matrix<ElemType>& InplaceSqrt ();
Matrix<ElemType>& AssignSqrtOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceExp ();
Matrix<ElemType>& AssignExpOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLog ();
Matrix<ElemType>& AssignLogOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceCosine ();
Matrix<ElemType>& AssignCosineOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceNegativeSine ();
Matrix<ElemType>& AssignNegativeSineOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLog10 ();
Matrix<ElemType>& AssignLog10Of (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceAbs ();
Matrix<ElemType>& AssignAbsOf (const Matrix<ElemType>& a);
//--- clipping/truncation against a threshold ---
Matrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
Matrix<ElemType>& AssignTruncateBottomOf (const Matrix<ElemType>& a, const ElemType threshold);
Matrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
Matrix<ElemType>& AssignTruncateTopOf (const Matrix<ElemType>& a, const ElemType threshold);
Matrix<ElemType>& InplaceTruncate (const ElemType threshold);
Matrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
//--- reductions and norms ---
DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements () const; //sum of all abs(elements)
ElemType SumOfElements () const; //sum of all elements
Matrix<ElemType>& AssignSumOfElements(const Matrix<ElemType>& a);
//--- row-slice and tiling helpers ---
Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
bool IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold = 1e-8) const;
//--- per-vector (row-wise or column-wise) norms; isColWise selects the reduction axis ---
void VectorNorm1(Matrix<ElemType>& c, const bool isColWise) const;
Matrix<ElemType>& AssignVectorNorm1Of(Matrix<ElemType>& a, const bool isColWise);
void VectorNorm2(Matrix<ElemType>& c, const bool isColWise) const;
Matrix<ElemType>& AssignVectorNorm2Of(Matrix<ElemType>& a, const bool isColWise);
void VectorNormInf(Matrix<ElemType>& c, const bool isColWise) const;
Matrix<ElemType>& AssignVectorNormInfOf(Matrix<ElemType>& a, const bool isColWise);
Matrix<ElemType>& AssignInnerProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool isColWise);
Matrix<ElemType>& AssignKhatriRaoProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AddColumnReshapeProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool transposeAColumn);
Matrix<ElemType>& AddWithScaleOf(ElemType alpha, const Matrix<ElemType>& a);
//--- whole-matrix norms ---
ElemType FrobeniusNorm() const;
Matrix<ElemType>& AssignFrobeniusNormOf(const Matrix<ElemType>& a);
ElemType MatrixNormInf() const;
ElemType MatrixNorm1() const;
ElemType MatrixNorm0() const; //number of non-zero elements
Matrix<ElemType>& AssignSignOf(const Matrix<ElemType>& a);
Matrix<ElemType>& AddSignOf(const Matrix<ElemType>& a);
//--- per-vector argmax/argmin: indexes and values returned in separate matrices ---
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const;
void VectorMin(Matrix<ElemType>& mainndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first
//--- diagnostics ---
bool HasNan (const char * name) const;
size_t CountNanInf() const;
void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
void Print(const char* matrixName = nullptr) const; //print whole matrix. can be expensive
//--- convolution support: pack/unpack image batches into the matrix layout used by the conv kernels ---
Matrix<ElemType>& AssignPackedConvolutionInput(const Matrix<ElemType>& inputSubBatch,
const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding = false);
Matrix<ElemType>& UnpackConvolutionInput(Matrix<ElemType>& inputSubBatch,
const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding = false) const;
//--- pooling forward (Assign*Result) and backward (Add*Gradient) operations ---
Matrix<ElemType>& AssignMaxPoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
Matrix<ElemType>& AddMaxPoolingGradient(const Matrix<ElemType>& outputGradientBatch, const Matrix<ElemType>& inputBatch, const Matrix<ElemType>& outputBatch,
const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
Matrix<ElemType>& AssignAveragePoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
Matrix<ElemType>& AddAveragePoolingGradient(const Matrix<ElemType>& outputGradientBatch,
const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
public:
//--- scalar math helpers ---
ElemType Exp10(ElemType num);
ElemType Mod(ElemType x , ElemType y);
ElemType LogAdd(ElemType x, ElemType y); // log(exp(x)+exp(y)), presumably computed in a numerically stable way -- confirm in implementation
public:
static DEVICEID_TYPE GetBestGPUDeviceId(); //{ return GPUMatrix<ElemType>::GetBestGPUDeviceId();}
//static BLAS functions
// singular value decomposition of A as A = U*SIGMA*VT
static void SVD(const Matrix<ElemType>& A, Matrix<ElemType>& SIGMA, Matrix<ElemType>& U, Matrix<ElemType>& VT);
//GEMM-style product: c = alpha * op(a) * op(b) + beta * c, where op is optional transpose
static void MultiplyAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
ElemType beta, Matrix<ElemType>& c);
static void MultiplyAndAdd(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
static void Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
static void Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
//AXPY-style scale-and-add into c
static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c);
//c (+)= alpha * (a - b); the Matrix-alpha overloads take the scalar as a 1x1 matrix
static void AddScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AssignScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AddScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AssignScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AddElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
//static void AddLogElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
static void AssignElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
static void Scale(ElemType alpha, Matrix<ElemType>& a);
static void Scale(Matrix<ElemType>& alpha, Matrix<ElemType>& a); //In this case Matrix alpha must be 1x1
static void Scale(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
static void InnerProduct (const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, const bool isColWise);
static ElemType InnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
static void ElementWisePower (ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
static bool AreEqual(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const ElemType threshold = 1e-8);
public:
// Deserialize a matrix from a File stream. The stream begins with a one-character
// marker: 'd' for dense, 's' for sparse. The destination matrix's device id decides
// whether a CPU- or GPU-side representation is filled in (reading a sparse matrix
// into a CPU-resident matrix is not implemented). Any other marker is a logic error.
friend File& operator>>(File& stream, Matrix<ElemType>& us)
{
    char marker;
    stream >> marker;
    const bool onCPU = us.GetDeviceId() < 0;
    switch (marker)
    {
    case 'd': // dense payload
        if (onCPU)
        {
            if (us.m_CPUMatrix == NULL)
                us.m_CPUMatrix = new CPUMatrix<ElemType>();
            stream >> (*us.m_CPUMatrix);
            us.SetDataLocation(CPU, DENSE);
        }
        else
        {
            if (us.m_GPUMatrix == NULL)
                us.m_GPUMatrix = new GPUMatrix<ElemType>();
            stream >> (*us.m_GPUMatrix);
            us.SetDataLocation(GPU, DENSE);
        }
        break;
    case 's': // sparse payload
        if (onCPU)
        {
            NOT_IMPLEMENTED;//You might want to tranfer your matrix to GPU
        }
        else
        {
            if (us.m_GPUSparseMatrix == NULL)
                us.m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>();
            stream >> (*us.m_GPUSparseMatrix);
            us.SetDataLocation(GPU, SPARSE);
        }
        break;
    default:
        LogicError("wrong matrix type!");
    }
    return stream;
}
// Serialize a matrix to a File stream, mirroring operator>>: first a one-character
// marker ('d' dense / 's' sparse), then the payload of whichever underlying
// representation (CPU or GPU) is active. CPU-resident sparse serialization is not
// implemented.
friend File& operator<<(File& stream, const Matrix<ElemType>& us)
{
    const bool isDense = (us.GetMatrixType() == MatrixType::DENSE);
    const bool onCPU = us.GetDeviceId() < 0;
    stream << (isDense ? 'd' : 's');
    if (isDense)
    {
        if (onCPU)
            stream << (*us.m_CPUMatrix);
        else
            stream << (*us.m_GPUMatrix);
    }
    else
    {
        if (onCPU)
        {
            NOT_IMPLEMENTED;
            //stream<<(*M.m_CPUMatrix);
        }
        else
            stream << (*us.m_GPUSparseMatrix);
    }
    return stream;
}
public:
//--- class-based (hierarchical softmax style) cross-entropy helpers; cls/idx2cls map words to classes ---
static void ClassEntropy(const Matrix<ElemType>& a, const Matrix<ElemType>& weight,
const Matrix<ElemType> & label, const Matrix<ElemType>* cls,
const Matrix<ElemType>* idx2cls, Matrix<ElemType>& etp, Matrix<ElemType>& entropyScore);
static void ClassEntropyError(const Matrix<ElemType>& a);
static void ClassEntropyGradientOfInput(const Matrix<ElemType>& error, const Matrix<ElemType>& weight, Matrix<ElemType>& grd);
static void ClassEntropyGradientOfWeight(
const Matrix<ElemType>& error,
const Matrix<ElemType>& input,
const Matrix<ElemType>& weight,
const Matrix<ElemType> & label,
const Matrix<ElemType>* cls,
const Matrix<ElemType>* idx2cls,
Matrix<ElemType>& grd);
};
//convenience aliases for the two supported element types
typedef Matrix<float> SingleMatrix;
typedef Matrix<double> DoubleMatrix;
}}}
//
// <copyright file="Matrix.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "CPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {
//Where the matrix's data currently resides: nowhere yet, CPU RAM, GPU memory, or both (in sync)
enum CurrentDataLocation
{
    NONE, CPU, GPU, BOTH
};
//Storage format of the matrix; UNDETERMINED until the first operation fixes it
enum MatrixType
{
    UNDETERMINED, DENSE, SPARSE
};
//To comply with BLAS libraries, matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
//conversion is needed when passing data between Matrix and C++ matrices
//For the best performance compile CNTKMath project with NO_SYNC preprocessor directive
//!!!WARNING!!! This class is NOT THREAD SAFE. Test and add necessary modifications if using in multi-threaded environment
// NOTE(review): this file appears to contain two concatenated copies of Matrix.h
// (merge-conflict artifact; the header comment and #pragma once reappear mid-file) --
// confirm and deduplicate.
//Facade over CPU/GPU, dense/sparse matrix implementations; dispatches each operation
//to whichever underlying representation is current.
template<class ElemType>
class MATH_API Matrix
{
private:
    //pointers to the concrete representations; at most one is "current" at a time
    mutable BaseMatrix<ElemType> *m_baseMatrix;
    mutable GPUMatrix<ElemType> *m_GPUMatrix;
    mutable CPUMatrix<ElemType> *m_CPUMatrix;
    mutable GPUSparseMatrix<ElemType> *m_GPUSparseMatrix;
    mutable CPUSparseMatrix<ElemType> *m_CPUSparseMatrix;
    mutable MatrixType m_matrixType;
    mutable CurrentDataLocation m_currentDataLocation; //Indicates which matrix is current
    mutable DEVICEID_TYPE m_preferredDeviceId;
    //Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
    void _transferFromDeviceToDevice(int id_from, int id_to, bool ismoved=true,bool emptyTransfer=false) const;
    //Moves matrix from current device to device with id_to. This method doesn't change preferred device Id
    void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
    static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
    static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
    static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
public:
    //Constructors, destructors and other static matrix builders
    //Each constructor can take deviceId as parameter.
    //If deviceId<0 then the matrix will be based in RAM (CPUMatrix)
    //Elseif deviceId>=0 and <AUTOPLACEMATRIX, then the matrix will be based on GPU with specified deviceId
    //Else (default) if deviceId=AUTOPLACEMATRIX, the class will try to place itself on the best GPU, if fails it will go to CPU
    //The default behaviour should be deviceId=AUTOPLACEMATRIX
    Matrix(DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
    Matrix(BaseMatrix<ElemType>* baseMatrix, ElemType *pArray, DEVICEID_TYPE deviceId); // constructor for setting Matrix from a base matrix (externally managed buffer pArray)
    Matrix(FILE* f, const char * matrixName, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const MatrixType matrixType = DENSE); //matrixName is used to verify that correct matrix is read.
    Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const MatrixType matrixType = DENSE);
    Matrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags=matrixFlagNormal, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const size_t nnz=0);
    Matrix(const Matrix<ElemType>& deepCopyFrom, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX); //copy constructor, deep copy
    Matrix<ElemType>& operator=(const Matrix<ElemType>& deepCopyFrom); //assignment operator, deep copy
    Matrix(Matrix<ElemType>&& moveFrom); //move constructor, shallow copy
    Matrix<ElemType>& operator=(Matrix<ElemType>&& moveFrom); //move assignment operator, shallow copy
    //factory builders for common matrices
    static Matrix<ElemType> Ones(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
    static Matrix<ElemType> Zeros(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
    static Matrix<ElemType> Eye(const size_t rows, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
    static Matrix<ElemType> RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed=USE_TIME_BASED_SEED, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
    static Matrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED, DEVICEID_TYPE deviceId=AUTOPLACEMATRIX);
    void Clear();
    ~Matrix();
private:
    Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
    Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
    Matrix(const MatrixFlags matrixFlags, DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
    void Init(DEVICEID_TYPE deviceID); //only used internally to initialize a blank matrix
    void SetDataLocation(CurrentDataLocation location, MatrixType type=UNDETERMINED) const;
public:
    //--- simple accessors ---
    MatrixType GetMatrixType() const {return m_matrixType;};
    bool OwnBuffer() const {return m_baseMatrix->OwnBuffer();}
    int GetDeviceId() const; //-1 if CPU, otherwise GPU CUDA device id
    DEVICEID_TYPE GetPreferredDeviceId() const { return m_preferredDeviceId; }; //-1 if CPU, otherwise GPU CUDA device id
    void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId){ if (m_preferredDeviceId != preferredDeviceId) m_preferredDeviceId = preferredDeviceId; }
    //Moves matrix from device id_from to device with id_to.
    //If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor
    void TransferFromDeviceToDevice(int id_from, int id_to, bool ismoved=false, bool emptyTransfer=false, bool updatePreferredDevice=true) const;
    CurrentDataLocation GetCurrentMatrixLocation() const { return m_currentDataLocation; };
    void SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat newMatrixFormat = matrixFormatSparseCSR); //sets matrix type between dense and sparse
    size_t GetNumRows() const;
    size_t GetNumCols() const;
    size_t GetNumElements() const;
    wchar_t* GetMatrixName() const;
    void SetMatrixName(const wchar_t* s);
    bool IsEmpty() const;
    size_t BufferSize() const;
    ElemType* BufferPointer() const;
    size_t NzCount() const;
    ElemType* CopyToArray() const; //allocated by the callee but need to be deleted by the caller
    size_t CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const; //allocated by the callee but need to be deleted by the caller
    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
    Matrix<ElemType>& AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
    void ShiftBy(int numShift) ;
    //SGD update with momentum applied to *this (the parameter matrix)
    void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
void Adagrad(Matrix<ElemType>& gradients);
void RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN);
void Reshape(const size_t numRows, const size_t numCols);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 0, bool growOnly = true); //by default we only reallocate if need to grow
size_t GetAllocatedSize() const;
void Reset(); //reset for sparse matrix
const ElemType operator() (const size_t row, const size_t col) const;
ElemType& operator() (const size_t row, const size_t col);
ElemType Get00Element() const;
void SetValue(const ElemType v);
void SetValue(const DeviceBoundNumber<ElemType>& db_number);
void SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR);
void SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags=matrixFlagNormal, int deviceId=MANAGEDEXTERN);
void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
void SetMatrixFromCSCFormat(const GPUSPARSE_INDEX_TYPE *h_CSCCol, const GPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
const size_t nz, const size_t numRows, const size_t numCols);
void SetMatrixFromLabelAndClass(CPUSPARSE_INDEX_TYPE *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t labelSize, size_t expandedSize, size_t blockSize);
void SetColumn(const ElemType* colPointer, size_t colInd);
void SetColumn(const ElemType val, size_t colInd);
void SetColumn(const Matrix<ElemType>& valMat, size_t colInd);
void SetDiagonalValue(const ElemType v);
void SetDiagonalValue(Matrix<ElemType>& vector);
void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed=USE_TIME_BASED_SEED);
void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED);
void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed=USE_TIME_BASED_SEED);
void AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed=USE_TIME_BASED_SEED);
Matrix<ElemType> Transpose();
Matrix<ElemType>& AssignTransposeOf (const Matrix<ElemType>& a);
Matrix<ElemType>& operator+= (const ElemType alpha);
Matrix<ElemType> operator+ (const ElemType alpha) const;
Matrix<ElemType>& AssignSumOf(const ElemType alpha, const Matrix<ElemType>& a);
Matrix<ElemType>& operator+= (const Matrix<ElemType>& a);
Matrix<ElemType> operator+ (const Matrix<ElemType>& a) const;
Matrix<ElemType>& AssignSumOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& operator-= (const ElemType alpha);
Matrix<ElemType> operator- (const ElemType alpha) const;
Matrix<ElemType>& AssignDifferenceOf(const ElemType alpha, const Matrix<ElemType>& a);
Matrix<ElemType>& AssignDifferenceOf(const Matrix<ElemType>& a, const ElemType alpha);
Matrix<ElemType>& operator-= (const Matrix<ElemType>& a);
Matrix<ElemType> operator- (const Matrix<ElemType>& a) const;
Matrix<ElemType>& AssignDifferenceOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& operator*= (const ElemType alpha);
Matrix<ElemType> operator* (const ElemType alpha) const;
Matrix<ElemType>& AssignProductOf(const ElemType alpha, const Matrix<ElemType>& a);
Matrix<ElemType> operator* (const Matrix<ElemType>& a) const;
Matrix<ElemType>& AssignProductOf (const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB);
Matrix<ElemType>& operator/= (ElemType alpha);
Matrix<ElemType> operator/ (ElemType alpha) const;
Matrix<ElemType>& operator^= (ElemType alpha); //element-wise power
Matrix<ElemType> operator^ (ElemType alpha) const; //element-wise power
Matrix<ElemType>& AssignElementPowerOf(const Matrix<ElemType>& a, const ElemType power);
Matrix<ElemType>& ElementMultiplyWith (const Matrix<ElemType>& a);
Matrix<ElemType>& AssignElementProductOf (const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AddElementProductOf (const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AssignElementDivisionOf (const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& ElementDivideBy(const Matrix<ElemType>& a);
Matrix<ElemType>& ColumnElementMultiplyWith(const Matrix<ElemType>& a);
Matrix<ElemType>& RowElementMultiplyWith(const Matrix<ElemType>& a);
Matrix<ElemType>& ColumnElementDivideBy(const Matrix<ElemType>& a);
Matrix<ElemType>& RowElementDivideBy(const Matrix<ElemType>& a);
Matrix<ElemType>& ElementInverse ();
Matrix<ElemType>& AssignElementInverseOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLinearRectifierDerivative();
Matrix<ElemType>& AssignLinearRectifierDerivativeOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceSigmoidDerivative();
Matrix<ElemType>& AssignSigmoidDerivativeOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceSigmoid ();
Matrix<ElemType>& AssignSigmoidOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceTanh ();
Matrix<ElemType>& AssignTanhOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLogSoftmax (const bool isColWise);
Matrix<ElemType>& AssignLogSoftmaxOf (const Matrix<ElemType>& a, const bool isColWise);
Matrix<ElemType>& InplaceSqrt ();
Matrix<ElemType>& AssignSqrtOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceExp ();
Matrix<ElemType>& AssignExpOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLog ();
Matrix<ElemType>& AssignLogOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceCosine ();
Matrix<ElemType>& AssignCosineOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceNegativeSine ();
Matrix<ElemType>& AssignNegativeSineOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceLog10 ();
Matrix<ElemType>& AssignLog10Of (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceAbs ();
Matrix<ElemType>& AssignAbsOf (const Matrix<ElemType>& a);
Matrix<ElemType>& InplaceTruncateBottom (const ElemType threshold);
Matrix<ElemType>& AssignTruncateBottomOf (const Matrix<ElemType>& a, const ElemType threshold);
Matrix<ElemType>& InplaceTruncateTop (const ElemType threshold);
Matrix<ElemType>& AssignTruncateTopOf (const Matrix<ElemType>& a, const ElemType threshold);
Matrix<ElemType>& InplaceTruncate (const ElemType threshold);
Matrix<ElemType>& SetToZeroIfAbsLessThan (const ElemType threshold);
DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements () const; //sum of all abs(elements)
ElemType SumOfElements () const; //sum of all elements
Matrix<ElemType>& AssignSumOfElements(const Matrix<ElemType>& a);
Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
bool IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold = 1e-8) const;
void VectorNorm1(Matrix<ElemType>& c, const bool isColWise) const;
Matrix<ElemType>& AssignVectorNorm1Of(Matrix<ElemType>& a, const bool isColWise);
void VectorNorm2(Matrix<ElemType>& c, const bool isColWise) const;
Matrix<ElemType>& AssignVectorNorm2Of(Matrix<ElemType>& a, const bool isColWise);
void VectorNormInf(Matrix<ElemType>& c, const bool isColWise) const;
Matrix<ElemType>& AssignVectorNormInfOf(Matrix<ElemType>& a, const bool isColWise);
Matrix<ElemType>& AssignInnerProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool isColWise);
Matrix<ElemType>& AssignKhatriRaoProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AddColumnReshapeProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool transposeAColumn);
Matrix<ElemType>& AddWithScaleOf(ElemType alpha, const Matrix<ElemType>& a);
ElemType FrobeniusNorm() const;
Matrix<ElemType>& AssignFrobeniusNormOf(const Matrix<ElemType>& a);
ElemType MatrixNormInf() const;
ElemType MatrixNorm1() const;
ElemType MatrixNorm0() const; //number of non-zero elemets
Matrix<ElemType>& AssignSignOf(const Matrix<ElemType>& a);
Matrix<ElemType>& AddSignOf(const Matrix<ElemType>& a);
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const;
void VectorMin(Matrix<ElemType>& mainndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first
bool HasNan (const char * name) const;
size_t CountNanInf() const;
void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
void Print(const char* matrixName = nullptr) const; //print whole matrix. can be expensive

// --- Convolution / pooling support -------------------------------------------------------
// These routines pack/unpack image mini-batches so that convolution can be computed as a
// matrix product, and implement forward/backward passes for max- and average-pooling.
// The geometry arguments describe the per-sample image layout (width x height x channels);
// horizontalSubsample/verticalSubsample are the kernel/window strides.
// NOTE(review): exact packed memory layout is defined by the implementation -- confirm there.

// Unrolls inputSubBatch into 'this' so a convolution becomes a single matrix multiply
// (im2col-style packing -- presumably; verify against the implementation).
// zeroPadding: if true, positions falling outside the input are treated as zeros.
Matrix<ElemType>& AssignPackedConvolutionInput(const Matrix<ElemType>& inputSubBatch,
const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding = false);
// Inverse of AssignPackedConvolutionInput: scatters the packed representation in 'this'
// back into inputSubBatch (used on the backward pass). Same geometry arguments apply.
Matrix<ElemType>& UnpackConvolutionInput(Matrix<ElemType>& inputSubBatch,
const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding = false) const;
// Max-pooling forward: 'this' receives, per window, the maximum of inputBatch.
Matrix<ElemType>& AssignMaxPoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
// Max-pooling backward: accumulates (note: Add*, not Assign*) the gradient into 'this';
// needs inputBatch and outputBatch to locate which input position held each window's max.
Matrix<ElemType>& AddMaxPoolingGradient(const Matrix<ElemType>& outputGradientBatch, const Matrix<ElemType>& inputBatch, const Matrix<ElemType>& outputBatch,
const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
// Average-pooling forward: 'this' receives, per window, the mean of inputBatch.
Matrix<ElemType>& AssignAveragePoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
// Average-pooling backward: accumulates the gradient into 'this'. Unlike max-pooling,
// no forward-pass activations are needed (the gradient spreads uniformly over the window).
Matrix<ElemType>& AddAveragePoolingGradient(const Matrix<ElemType>& outputGradientBatch,
const size_t channels,
const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
public:
// Scalar (element-type) math helpers.
ElemType Exp10(ElemType num);                  // presumably 10^num -- confirm in implementation
ElemType Mod(ElemType x , ElemType y);         // floating-point modulus of x by y -- sign convention defined by implementation
ElemType LogAdd(ElemType x, ElemType y);       // presumably stable log(exp(x)+exp(y)) (log-sum-exp) -- confirm
public:
static DEVICEID_TYPE GetBestGPUDeviceId(); //{ return GPUMatrix<ElemType>::GetBestGPUDeviceId();}
//static BLAS functions
// All static routines below write their result into the output argument 'c' (or 'a' for
// the in-place Scale overloads) and dispatch to the CPU or GPU implementation based on
// where the operands live.
// singular value decomposition of A as A = U*SIGMA*VT
static void SVD(const Matrix<ElemType>& A, Matrix<ElemType>& SIGMA, Matrix<ElemType>& U, Matrix<ElemType>& VT);
// GEMM-style: c = alpha * op(a) * op(b) + beta * c, where op transposes when the flag is set.
static void MultiplyAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
ElemType beta, Matrix<ElemType>& c);
// c += op(a) * op(b)   (alpha = beta = 1 case)
static void MultiplyAndAdd(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
// c = op(a) * op(b)
static void Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
// c = a * b  (no transposition)
static void Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
// c += alpha * a   (AXPY-style)
static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
// c = alpha * a + beta * c
static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c);
// c += alpha * (a - b)  /  c = alpha * (a - b); the Matrix-alpha overloads presumably
// require alpha to be a 1x1 matrix (cf. the Scale overload below) -- confirm.
static void AddScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AssignScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AddScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void AssignScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
// Single-element ops: c(ci,cj) += a(ai,aj)  /  c(ci,cj) = a(ai,aj).
static void AddElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
//static void AddLogElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
static void AssignElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
// In-place and out-of-place scaling: a *= alpha  /  c = alpha * a.
static void Scale(ElemType alpha, Matrix<ElemType>& a);
static void Scale(Matrix<ElemType>& alpha, Matrix<ElemType>& a); //In this case Matrix alpha must be 1x1
static void Scale(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
// Column-wise (or row-wise, per isColWise) inner products of a and b into c.
static void InnerProduct (const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, const bool isColWise);
// Frobenius-style inner product reduced to a single scalar.
static ElemType InnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
// c = a .^ alpha  (element-wise power)
static void ElementWisePower (ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
// Element-wise comparison within the given tolerance.
static bool AreEqual(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const ElemType threshold = 1e-8);
public:
// Deserializes a matrix from 'stream'. The on-disk record begins with a one-character
// format tag: 'd' = dense, 's' = sparse. The payload is read into a CPU or GPU
// representation depending on where M currently lives; lazily allocates the backing
// storage if it does not exist yet. CPU sparse deserialization is not implemented.
friend File& operator>>(File& stream, Matrix<ElemType>& M)
{
    char formatTag;
    stream >> formatTag;
    switch (formatTag)
    {
    case 'd': // dense payload
        if (M.GetDeviceId() < 0)
        {
            if (M.m_CPUMatrix == NULL)
                M.m_CPUMatrix = new CPUMatrix<ElemType>();
            stream >> (*M.m_CPUMatrix);
            M.SetDataLocation(CPU, DENSE);
        }
        else
        {
            if (M.m_GPUMatrix == NULL)
                M.m_GPUMatrix = new GPUMatrix<ElemType>();
            stream >> (*M.m_GPUMatrix);
            M.SetDataLocation(GPU, DENSE);
        }
        break;
    case 's': // sparse payload
        if (M.GetDeviceId() < 0)
        {
            NOT_IMPLEMENTED; // no CPU sparse reader; transfer the matrix to GPU first
        }
        else
        {
            if (M.m_GPUSparseMatrix == NULL)
                M.m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>();
            stream >> (*M.m_GPUSparseMatrix);
            M.SetDataLocation(GPU, SPARSE);
        }
        break;
    default:
        LogicError("wrong matrix type!");
    }
    return stream;
}
// Serializes M to 'stream' in the format read back by operator>>: a one-character tag
// ('d' dense, 's' sparse) followed by the payload of whichever representation (CPU/GPU)
// currently holds the data. CPU sparse serialization is not implemented.
friend File& operator<<(File& stream, const Matrix<ElemType>& M)
{
    const bool isDense = (M.GetMatrixType() == MatrixType::DENSE);
    const bool onCPU = (M.GetDeviceId() < 0);

    stream << (isDense ? 'd' : 's');
    if (isDense)
    {
        if (onCPU)
            stream << (*M.m_CPUMatrix);
        else
            stream << (*M.m_GPUMatrix);
    }
    else
    {
        if (onCPU)
            NOT_IMPLEMENTED; // no CPU sparse writer (would be *M.m_CPUMatrix equivalent)
        else
            stream << (*M.m_GPUSparseMatrix);
    }
    return stream;
}
public:
// --- Class-based cross-entropy -----------------------------------------------------------
// Static helpers for a class-factored softmax/cross-entropy criterion (presumably for
// class-based language models -- confirm against the implementation). 'cls' and 'idx2cls'
// appear to map between words and their classes; passed as pointers, so NOTE(review):
// check whether NULL is a legal value before relying on it.
static void ClassEntropy(const Matrix<ElemType>& a, const Matrix<ElemType>& weight,
const Matrix<ElemType> & label, const Matrix<ElemType>* cls,
const Matrix<ElemType>* idx2cls, Matrix<ElemType>& etp, Matrix<ElemType>& entropyScore);
// Error computation for the criterion above (in-place on 'a' -- confirm).
static void ClassEntropyError(const Matrix<ElemType>& a);
// Gradient of the criterion w.r.t. the input activations, written into 'grd'.
static void ClassEntropyGradientOfInput(const Matrix<ElemType>& error, const Matrix<ElemType>& weight, Matrix<ElemType>& grd);
// Gradient of the criterion w.r.t. the weight matrix, written into 'grd'.
static void ClassEntropyGradientOfWeight(
const Matrix<ElemType>& error,
const Matrix<ElemType>& input,
const Matrix<ElemType>& weight,
const Matrix<ElemType> & label,
const Matrix<ElemType>* cls,
const Matrix<ElemType>* idx2cls,
Matrix<ElemType>& grd);
};
// Convenience aliases for the two supported element types.
typedef Matrix<float> SingleMatrix;
typedef Matrix<double> DoubleMatrix;
}}}