diff --git a/.gitignore b/.gitignore
index 00a6a56c7..c08ca3570 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,162 +1,168 @@
-## Ignore Visual Studio temporary files, build results, and
-## files generated by popular Visual Studio add-ons.
-
-# User-specific files
-*.suo
-*.user
-*.sln.docstates
-*.orig
-
-# Build results
-
-[Dd]ebug/
-[Rr]elease/
-x64/
-build/
-[Bb]in/
-[Oo]bj/
-
-# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
-!packages/*/build/
-
-# MSTest test Results
-[Tt]est[Rr]esult*/
-[Bb]uild[Ll]og.*
-
-*_i.c
-*_p.c
-*.ilk
-*.meta
-*.obj
-*.pch
-*.pdb
-*.pgc
-*.pgd
-*.rsp
-*.sbr
-*.tlb
-*.tli
-*.tlh
-*.tmp
-*.tmp_proj
-*.log
-*.vspscc
-*.vssscc
-.builds
-*.pidb
-*.log
-*.scc
-*.dep
-
-# Visual C++ cache files
-ipch/
-*.aps
-*.ncb
-*.opensdf
-*.sdf
-*.cachefile
-
-# Visual Studio profiler
-*.psess
-*.vsp
-*.vspx
-
-# Guidance Automation Toolkit
-*.gpState
-
-# ReSharper is a .NET coding add-in
-_ReSharper*/
-*.[Rr]e[Ss]harper
-
-# TeamCity is a build add-in
-_TeamCity*
-
-# DotCover is a Code Coverage Tool
-*.dotCover
-
-# NCrunch
-*.ncrunch*
-.*crunch*.local.xml
-
-# Installshield output folder
-[Ee]xpress/
-
-# DocProject is a documentation generator add-in
-DocProject/buildhelp/
-DocProject/Help/*.HxT
-DocProject/Help/*.HxC
-DocProject/Help/*.hhc
-DocProject/Help/*.hhk
-DocProject/Help/*.hhp
-DocProject/Help/Html2
-DocProject/Help/html
-
-# Click-Once directory
-publish/
-
-# Publish Web Output
-*.Publish.xml
-
-# NuGet Packages Directory
-## TODO: If you have NuGet Package Restore enabled, uncomment the next line
-#packages/
-
-# Windows Azure Build Output
-csx
-*.build.csdef
-
-# Windows Store app package directory
-AppPackages/
-
-# Others
-sql/
-*.Cache
-ClientBin/
-[Ss]tyle[Cc]op.*
-~$*
-*~
-*.dbmdl
-*.[Pp]ublish.xml
-*.pfx
-*.publishsettings
-
-# RIA/Silverlight projects
-Generated_Code/
-
-# Backup & report files from converting an old project file to a newer
-# Visual Studio version. Backup files are not needed, because we have git ;-)
-_UpgradeReport_Files/
-Backup*/
-UpgradeLog*.XML
-UpgradeLog*.htm
-
-# SQL Server files
-App_Data/*.mdf
-App_Data/*.ldf
-
-
-#LightSwitch generated files
-GeneratedArtifacts/
-_Pvt_Extensions/
-ModelManifest.xml
-
-# =========================
-# Windows detritus
-# =========================
-
-# Windows image file caches
-Thumbs.db
-ehthumbs.db
-
-# Folder config file
-Desktop.ini
-
-# Recycle Bin used on file shares
-$RECYCLE.BIN/
-
-# Mac desktop service store files
-.DS_Store
-
-*.lyx~
-*.bak
-*.lyx#
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+*.orig
+
+# Build results
+
+[Dd]ebug/
+[Rr]elease/
+x64/
+build/
+[Bb]in/
+[Oo]bj/
+
+# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
+!packages/*/build/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.log
+*.scc
+*.dep
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+*.cachefile
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+*.ncrunch*
+.*crunch*.local.xml
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.Publish.xml
+
+# NuGet Packages Directory
+## TODO: If you have NuGet Package Restore enabled, uncomment the next line
+#packages/
+
+# Windows Azure Build Output
+csx
+*.build.csdef
+
+# Windows Store app package directory
+AppPackages/
+
+# Others
+sql/
+*.Cache
+ClientBin/
+[Ss]tyle[Cc]op.*
+~$*
+*~
+*.dbmdl
+*.[Pp]ublish.xml
+*.pfx
+*.publishsettings
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+App_Data/*.mdf
+App_Data/*.ldf
+
+
+#LightSwitch generated files
+GeneratedArtifacts/
+_Pvt_Extensions/
+ModelManifest.xml
+
+# =========================
+# Windows detritus
+# =========================
+
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Mac desktop service store files
+.DS_Store
+
+*.lyx~
+*.bak
+*.lyx#
+
+# =========================
+# prebuild file
+# =========================
+MachineLearning/cn/buildinfo.h
+
diff --git a/Common/ConfigFile.cpp b/Common/ConfigFile.cpp
index f3902df4b..f5eb505a3 100644
--- a/Common/ConfigFile.cpp
+++ b/Common/ConfigFile.cpp
@@ -1,279 +1,280 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// ConfigFile.cpp : Defines the configuration file loader.
-//
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "File.h"
-#include "commandArgUtil.h"
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-
-    // ParseCommandLine - parse the command line parameters
-    // argc - count of arguments
-    // argv - array of argument parameters
-    // config - config to return
-    std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
-    {
-        config.SetName(std::string("global"));
-
-        // This vector keeps track of the config files we have already read
-        std::vector<std::string> resolvedConfigFiles;
-        std::string configString;
-
-        // start at 1, because 0 is the name of the EXE
-        for (int i=1; i < argc; ++i)
-        {
-            wstring str = argv[i];
-
-            // see if they are loading a config file
-            wstring configDescriptor = L"configFile=";
-            int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
-
-            // no config file, parse as regular argument
-            if (compare)
-            {
-                configString += (msra::strfun::utf8(str) + "\n");
-            }
-            else // One or more config file paths specified in a "+"-separated list.
-            {
-                const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
-                std::vector<std::string> filePathsVec = msra::strfun::split(filePaths, "+");
-                for (auto filePath : filePathsVec)
-                {
-                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
-                    {
-                        // if we haven't already read this file, read it
-                        resolvedConfigFiles.push_back(filePath);
-                        configString += config.ReadConfigFile(filePath);
-                    }
-                    else
-                        RuntimeError("Cannot specify same config file multiple times at the command line.");
-                }
-            }
-        }
-
-        configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
-        config.FileParse(configString);
-        return configString;
-    }
-
-    // ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
-    // form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
-    // If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
-    // and includes the body of each file in the string which is eventually returned by this function. If the included
-    // config file includes other config files, this function will recursively include those files as well.
-    // configString - the config string within which to look for "include" statements
-    // resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
-    //                       and to prevent files from being included multiple times.
-    // returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
-    std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector<std::string> &resolvedConfigFiles)
-    {
-        std::vector<std::string> lines = msra::strfun::split(configString, "\n");
-        std::string includeKeyword = "include=";
-        std::size_t includeKeywordSize = includeKeyword.size();
-        std::string newConfigString;
-        for (std::string line : lines)
-        {
-            if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
-            {
-                std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
-                if (filePaths.find(openBraceVar) != std::string::npos)
-                {
-                    RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
-                }
-
-                std::vector<std::string> filePathVec = msra::strfun::split (filePaths, "+");
-                for (auto filePath : filePathVec)
-                {
-                    // if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
-                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
-                    {
-                        // Recursively resolve the include statements in the included config files.
-                        // Ensure that the same config file isn't included twice, by keeping track of the config
-                        // files that have already been resolved in the resolvedPaths vector.
-                        resolvedConfigFiles.push_back(filePath);
-                        newConfigString += ResolveIncludeStatements(
-                            ReadConfigFile(filePath),
-                            resolvedConfigFiles
-                            );
-                    }
-                    else
-                    {
-                        // We already resolved this path. Write a warning so that the user is aware of this.
-                        // TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
-                        fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
-                    }
-                }
-            }
-            else
-            {
-                newConfigString += (line + "\n");
-            }
-        }
-        return newConfigString;
-    }
-
-    // LoadConfigFiles - load multiple configuration files, and add them to the config parameters
-    // filePaths - A "+" delimited list of file paths, corresponding to config files to load
-    // configStringToAppend - A config string which should be processed together with the config files
-    void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
-    {
-        std::string configString = ReadConfigFiles(filePaths);
-        if(configStringToAppend != nullptr)
-        {
-            configString += *configStringToAppend;
-        }
-
-        FileParse(configString);
-    }
-
-    // LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
-    // If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
-    // then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
-    // in order to allow them to access variables defined in the primary config file via $varName$ syntax.
-    // filePath - filePath to the file to load
-    // config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
-    void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
-    {
-        // read file, resolve variables, and then parse.
-        std::string fileContents = ReadConfigFile(filePath);
-        fileContents = config.ResolveVariables(fileContents);
-        FileParse(fileContents);
-    }
-
-    // LoadConfigFile - load a configuration file, and add to config parameters
-    // filePath - filePath to the file to read
-    void ConfigParser::LoadConfigFile(const std::wstring &filePath)
-    {
-        // read and then parse
-        FileParse(ReadConfigFile(filePath));
-    }
-
-    // Same as the "ReadConfigFiles" function below, but takes a string instead of a wstring
-    std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
-    {
-        return ReadConfigFiles(msra::strfun::utf16(filePaths));
-    }
-
-    // ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
-    // filePaths - A "+" delimited list of file paths, corresponding to config files to read
-    // returns: a string with the concatenated file contents
-    std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
-    {
-        std::string configString;
-        std::vector<std::wstring> filePathVec = msra::strfun::split (filePaths, L"+");
-        for (auto filePath : filePathVec)
-        {
-            configString += ReadConfigFile(filePath);
-        }
-        return configString;
-    }
-
-    // Same as the "ReadConfigFile" function below, but takes a string instead of a wstring
-    std::string ConfigParser::ReadConfigFile(const std::string &filePath)
-    {
-        return ReadConfigFile(msra::strfun::utf16(filePath));
-    }
-
-    // ReadConfigFile - read a configuration file, and return it as a string
-    // filePath - the path to the config file to read
-    // returns: a string with the concatenated file contents
-    std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
-    {
-        File file(filePath, fileOptionsRead);
-
-        // initialize with file name
-        std::string path = msra::strfun::utf8(filePath);
-        auto location = path.find_last_of("/\\");
-        if (location != npos)
-            path = path.substr(location+1);
-        m_configName = move(path);
-
-        // read the entire file into a string
-        // CONSIDER: should the File API support this, instead of line by line?
-        size_t fileLength = file.Size();
-        string str;
-        string configFile;
-        configFile.reserve(fileLength);
-        while (!file.IsEOF())
-        {
-            file.GetLine(str);
-            str = PreprocessConfigLine(str);
-            if (str != "")
-            {
-                configFile.append(str);
-                configFile.append("\n");
-            }
-        }
-        return configFile;
-    }
-
-    // GetFileConfigNames - determine the names of the features and labels sections in the config file
-    // features - [in,out] a vector of feature name strings
-    // labels - [in,out] a vector of label name strings
-    void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
-    {
-        for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
-        {
-            auto pair = *iter;
-            ConfigParameters temp (iter->second);
-            // see if we have a config section that contains a "dim" element; if it's a sub key, use it
-            if (temp.ExistsCurrent("dim"))
-            {
-                if (temp.ExistsCurrent("labelMappingFile")
-                    || temp.ExistsCurrent("labelDim")
-                    || temp.ExistsCurrent("labelType")
-                    || (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
-                {
-                    labels.push_back(msra::strfun::utf16(iter->first));
-                }
-                else
-                {
-                    features.push_back(msra::strfun::utf16(iter->first));
-                }
-            }
-        }
-    }
-
-    // FindConfigNames - determine the names of the hierarchy of sections in the config file that contain a particular key
-    // config - configuration to search
-    // key - string we are searching for in each config section
-    // names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
-    void FindConfigNames(const ConfigParameters& config, std::string key, std::vector<std::wstring>& names)
-    {
-        for (auto iter = config.begin(); iter != config.end(); ++iter)
-        {
-            auto pair = *iter;
-            ConfigParameters temp (iter->second);
-            // see if we have a config section that contains the "key" element; if so, use it
-            if (temp.ExistsCurrent(key))
-            {
-                names.push_back(msra::strfun::utf16(iter->first));
-            }
-        }
-    }
-
-    // Trim - trim white space off the start and end of the string
-    // str - string to trim
-    // NOTE: if the entire string is whitespace, then the string will be set to an empty string
-    void Trim(std::string& str)
-    {
-        auto found = str.find_first_not_of(" \t");
-        if (found == npos)
-        {
-            str.erase(0);
-            return;
-        }
-        str.erase(0, found);
-        found = str.find_last_not_of(" \t");
-        if (found != npos)
-            str.erase(found+1);
-    }
-
-}}}
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// ConfigFile.cpp : Defines the configuration file loader.
+//
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
+#include "File.h"
+#include "commandArgUtil.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+
+    // ParseCommandLine - parse the command line parameters
+    // argc - count of arguments
+    // argv - array of argument parameters
+    // config - config to return
+    std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
+    {
+        config.SetName(std::string("global"));
+
+        // This vector keeps track of the config files we have already read
+        std::vector<std::string> resolvedConfigFiles;
+        std::string configString;
+
+        // start at 1, because 0 is the name of the EXE
+        for (int i=1; i < argc; ++i)
+        {
+            wstring str = argv[i];
+
+            // see if they are loading a config file
+            wstring configDescriptor = L"configFile=";
+            int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
+
+            // no config file, parse as regular argument
+            if (compare)
+            {
+                configString += (msra::strfun::utf8(str) + "\n");
+            }
+            else // One or more config file paths specified in a "+"-separated list.
+            {
+                const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
+                std::vector<std::string> filePathsVec = msra::strfun::split(filePaths, "+");
+                for (auto filePath : filePathsVec)
+                {
+                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
+                    {
+                        // if we haven't already read this file, read it
+                        resolvedConfigFiles.push_back(filePath);
+                        configString += config.ReadConfigFile(filePath);
+                    }
+                    else
+                        RuntimeError("Cannot specify same config file multiple times at the command line.");
+                }
+            }
+        }
+
+        configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
+        config.FileParse(configString);
+        return configString;
+    }
+
+    // ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
+    // form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
+    // If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
+    // and includes the body of each file in the string which is eventually returned by this function. If the included
+    // config file includes other config files, this function will recursively include those files as well.
+    // configString - the config string within which to look for "include" statements
+    // resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
+    //                       and to prevent files from being included multiple times.
+    // returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
+    std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector<std::string> &resolvedConfigFiles)
+    {
+        std::vector<std::string> lines = msra::strfun::split(configString, "\n");
+        std::string includeKeyword = "include=";
+        std::size_t includeKeywordSize = includeKeyword.size();
+        std::string newConfigString;
+        for (std::string line : lines)
+        {
+            if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
+            {
+                std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
+                if (filePaths.find(openBraceVar) != std::string::npos)
+                {
+                    RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
+                }
+
+                std::vector<std::string> filePathVec = msra::strfun::split (filePaths, "+");
+                for (auto filePath : filePathVec)
+                {
+                    // if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
+                    if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
+                    {
+                        // Recursively resolve the include statements in the included config files.
+                        // Ensure that the same config file isn't included twice, by keeping track of the config
+                        // files that have already been resolved in the resolvedPaths vector.
+                        resolvedConfigFiles.push_back(filePath);
+                        newConfigString += ResolveIncludeStatements(
+                            ReadConfigFile(filePath),
+                            resolvedConfigFiles
+                            );
+                    }
+                    else
+                    {
+                        // We already resolved this path. Write a warning so that the user is aware of this.
+                        // TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
+                        fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
+                    }
+                }
+            }
+            else
+            {
+                newConfigString += (line + "\n");
+            }
+        }
+        return newConfigString;
+    }
+
+    // LoadConfigFiles - load multiple configuration files, and add them to the config parameters
+    // filePaths - A "+" delimited list of file paths, corresponding to config files to load
+    // configStringToAppend - A config string which should be processed together with the config files
+    void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
+    {
+        std::string configString = ReadConfigFiles(filePaths);
+        if(configStringToAppend != nullptr)
+        {
+            configString += *configStringToAppend;
+        }
+
+        FileParse(configString);
+    }
+
+    // LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
+    // If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
+    // then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
+    // in order to allow them to access variables defined in the primary config file via $varName$ syntax.
+    // filePath - filePath to the file to load
+    // config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
+    void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
+    {
+        // read file, resolve variables, and then parse.
+        std::string fileContents = ReadConfigFile(filePath);
+        fileContents = config.ResolveVariables(fileContents);
+        FileParse(fileContents);
+    }
+
+    // LoadConfigFile - load a configuration file, and add to config parameters
+    // filePath - filePath to the file to read
+    void ConfigParser::LoadConfigFile(const std::wstring &filePath)
+    {
+        // read and then parse
+        FileParse(ReadConfigFile(filePath));
+    }
+
+    // Same as the "ReadConfigFiles" function below, but takes a string instead of a wstring
+    std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
+    {
+        return ReadConfigFiles(msra::strfun::utf16(filePaths));
+    }
+
+    // ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
+    // filePaths - A "+" delimited list of file paths, corresponding to config files to read
+    // returns: a string with the concatenated file contents
+    std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
+    {
+        std::string configString;
+        std::vector<std::wstring> filePathVec = msra::strfun::split (filePaths, L"+");
+        for (auto filePath : filePathVec)
+        {
+            configString += ReadConfigFile(filePath);
+        }
+        return configString;
+    }
+
+    // Same as the "ReadConfigFile" function below, but takes a string instead of a wstring
+    std::string ConfigParser::ReadConfigFile(const std::string &filePath)
+    {
+        return ReadConfigFile(msra::strfun::utf16(filePath));
+    }
+
+    // ReadConfigFile - read a configuration file, and return it as a string
+    // filePath - the path to the config file to read
+    // returns: a string with the concatenated file contents
+    std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
+    {
+        File file(filePath, fileOptionsRead);
+
+        // initialize with file name
+        std::string path = msra::strfun::utf8(filePath);
+        auto location = path.find_last_of("/\\");
+        if (location != npos)
+            path = path.substr(location+1);
+        m_configName = move(path);
+
+        // read the entire file into a string
+        // CONSIDER: should the File API support this, instead of line by line?
+        size_t fileLength = file.Size();
+        string str;
+        string configFile;
+        configFile.reserve(fileLength);
+        while (!file.IsEOF())
+        {
+            file.GetLine(str);
+            str = PreprocessConfigLine(str);
+            if (str != "")
+            {
+                configFile.append(str);
+                configFile.append("\n");
+            }
+        }
+        return configFile;
+    }
+
+    // GetFileConfigNames - determine the names of the features and labels sections in the config file
+    // features - [in,out] a vector of feature name strings
+    // labels - [in,out] a vector of label name strings
+    void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
+    {
+        for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
+        {
+            auto pair = *iter;
+            ConfigParameters temp (iter->second);
+            // see if we have a config section that contains a "dim" element; if it's a sub key, use it
+            if (temp.ExistsCurrent("dim"))
+            {
+                if (temp.ExistsCurrent("labelMappingFile")
+                    || temp.ExistsCurrent("labelDim")
+                    || temp.ExistsCurrent("labelType")
+                    || (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
+                {
+                    labels.push_back(msra::strfun::utf16(iter->first));
+                }
+                else
+                {
+                    features.push_back(msra::strfun::utf16(iter->first));
+                }
+            }
+        }
+    }
+
+    // FindConfigNames - determine the names of the hierarchy of sections in the config file that contain a particular key
+    // config - configuration to search
+    // key - string we are searching for in each config section
+    // names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
+    void FindConfigNames(const ConfigParameters& config, std::string key, std::vector<std::wstring>& names)
+    {
+        for (auto iter = config.begin(); iter != config.end(); ++iter)
+        {
+            auto pair = *iter;
+            ConfigParameters temp (iter->second);
+            // see if we have a config section that contains the "key" element; if so, use it
+            if (temp.ExistsCurrent(key))
+            {
+                names.push_back(msra::strfun::utf16(iter->first));
+            }
+        }
+    }
+
+    // Trim - trim white space off the start and end of the string
+    // str - string to trim
+    // NOTE: if the entire string is whitespace, then the string will be set to an empty string
+    void Trim(std::string& str)
+    {
+        auto found = str.find_first_not_of(" \t");
+        if (found == npos)
+        {
+            str.erase(0);
+            return;
+        }
+        str.erase(0, found);
+        found = str.find_last_not_of(" \t");
+        if (found != npos)
+            str.erase(found+1);
+    }
+
+}}}
\ No newline at end of file
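
The loop-prevention idea in ConfigParser::ResolveIncludeStatements above is easiest to see in isolation. Below is a minimal, self-contained sketch of the same duplicate-include guard; the g_files map and readFile helper are hypothetical stand-ins for ConfigParser::ReadConfigFile, and only the resolved-list bookkeeping mirrors the shipped code.

#include <algorithm>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for ConfigParser::ReadConfigFile: serves file bodies from memory.
static std::map<std::string, std::string> g_files = {
    { "a.cfg", "x=1\ninclude=b.cfg\n" },
    { "b.cfg", "y=2\ninclude=a.cfg\n" }, // includes a.cfg again, forming a cycle
};
static std::string readFile(const std::string& path) { return g_files[path]; }

// Splice included files into the config text, including each file at most once.
static std::string resolveIncludes(const std::string& text, std::vector<std::string>& resolved)
{
    const std::string keyword = "include=";
    std::string out;
    std::istringstream in(text);
    std::string line;
    while (std::getline(in, line))
    {
        if (line.compare(0, keyword.size(), keyword) != 0)
        {
            out += line + "\n"; // ordinary line: copy through
            continue;
        }
        std::string path = line.substr(keyword.size());
        if (std::find(resolved.begin(), resolved.end(), path) == resolved.end())
        {
            resolved.push_back(path);                         // mark resolved *before* recursing...
            out += resolveIncludes(readFile(path), resolved); // ...so include cycles terminate
        }
    }
    return out;
}

int main()
{
    std::vector<std::string> resolved = { "a.cfg" };
    std::cout << resolveIncludes(readFile("a.cfg"), resolved); // prints "x=1\ny=2\n"
}

The design point carried over from the code above: a file is recorded as resolved before its contents are recursed into, which is exactly what breaks a.cfg -> b.cfg -> a.cfg cycles.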
diff --git a/Common/File.cpp b/Common/File.cpp
index fc77c0af3..896b5dd22 100644
--- a/Common/File.cpp
+++ b/Common/File.cpp
@@ -1,631 +1,633 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "basetypes.h"
-#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
-#include "fileutil.h"
-#include "File.h"
-#include
-#include
-#include
-#ifdef _WIN32
-#include
-#endif
-#ifdef __unix__
-#include
-#endif
-
-namespace Microsoft{ namespace MSR { namespace CNTK {
-
-// File creation
-// filename - the path
-// fileOptions - options to open the file
-File::File(const std::wstring& filename, int fileOptions)
-{
-    Init(filename.c_str(), fileOptions);
-}
-
-File::File(const std::string& filename, int fileOptions)
-{
-    // this converts from string to wstring, and then to wchar_t*
-    Init(msra::strfun::utf16(filename).c_str(), fileOptions);
-}
-
-File::File(const wchar_t* filename, int fileOptions)
-{
-    Init(filename, fileOptions);
-}
-
-void File::Init(const wchar_t* filename, int fileOptions)
-{
-    msra::files::make_intermediate_dirs(filename);
-    // translate the options string into a string for fopen()
-    wstring options = fileOptions&fileOptionsRead?L"r":L"";
-    if (fileOptions&fileOptionsWrite)
-    {
-        // if we already are reading the file, change to read/write
-        options.clear();
-        options.append(L"w+");
-    }
-    if (fileOptions&fileOptionsBinary)
-    {
-        options += L"b";
-    }
-    else
-    {
-        if (fileOptions & fileOptionsUnicode)
-            options += L"b";
-        else
-            options += L"t";
-        // I attempted to use the translated character-set modes, but encountered strange errors
-        //options += L"t, ccs=";
-        //options += (fileOptions & fileOptionsUnicode)?L"UNICODE":L"UTF-8";
-    }
-    // add sequential flag to allocate big read buffer
-    if (fileOptions & fileOptionsSequential)
-        options += L"S";
-
-    attempt([=](){m_file = fopenOrDie(filename, options.c_str());});
-    m_options = fileOptions;
-    m_size = filesize(m_file);
-}
-
-void File::goToDelimiter(int delim)
-{
-    int ch=0;
-
-    while (ch!=delim) {
-        ch=fgetc(m_file);
-        if (feof(m_file)) {
-            printf("Unexpected end of file\n");
-            throw std::logic_error("Unexpected end of file\n");
-        }
-    }
-}
-
-bool File::IsTextBased()
-{
-    return !!(m_options & (fileOptionsText|fileOptionsUnicode));
-}
-
-// File Destructor
-// closes the file
-File::~File(void)
-{
-    attempt([=] {fcloseOrDie(m_file);});
-}
-
-// GetLine - get a line from the file
-// str - string to store the line
-void File::GetLine(wstring& str)
-{
-    str = fgetlinew(m_file);
-}
-
-// GetLine - get a line from the file
-// str - string
-void File::GetLine(string& str)
-{
-    str = fgetline(m_file);
-}
-
-// Put a zero/space terminated wstring into a file
-// val - value to write to the file
-File& File::operator<<(const std::wstring& val)
-{
-    WriteString(val.c_str());
-    return *this;
-}
-
-
-// Put a zero/space terminated string into a file
-// val - value to write to the file
-File& File::operator<<(const std::string& val)
-{
-    WriteString(val.c_str());
-    return *this;
-}
-
-// Put a marker in the file, the marker depends on the file type
-// marker - marker to place in the file
-File& File::operator<<(FileMarker marker)
-{
-    File& file = *this;
-    switch(marker)
-    {
-    case fileMarkerBeginFile: // beginning of file marker
-        // only exists for UNICODE files
-        if (m_options & fileOptionsUnicode)
-            file << (unsigned int)0xfeff; // byte order mark
-        break;
-    case fileMarkerEndFile: // end of file marker
-        // use ^Z for end of file for text files
-        if (m_options & fileOptionsUnicode)
-            file << wchar_t(26); // ^Z
-        else if (m_options & fileOptionsText)
-            file << char(26);
-        break;
-    case fileMarkerBeginList: // Beginning of list marker
-        // no marker written for either
-        break;
-    case fileMarkerListSeparator: // separate elements of a list
-        // do nothing for now, built-in space delimiter for all types (before type)
-        // future: make this customizable, so you can specify a separator (i.e. ',')
-        break;
-    case fileMarkerEndList: // end of line/list marker
-        if (m_options & fileOptionsUnicode)
-            file.WriteString(L"\r\n"); // carriage return/line feed
-        else if (m_options & fileOptionsText)
-            file.WriteString("\r\n");
-        break;
-    case fileMarkerBeginSection: // beginning of section
-    case fileMarkerEndSection: // end of section
-        assert(false); // sections should use a string modifier
-        break;
-    }
-    return file;
-}
-
-// PutMarker for beginning of list support (lists with a count)
-// count - [in] the number of elements in the list
-File& File::PutMarker(FileMarker marker, size_t count)
-{
-    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count markers
-    *this << count;
-    return *this;
-}
-
-// PutMarker for section beginning and ending tags
-// section - [in]name of section
-File& File::PutMarker(FileMarker marker, const std::string& section)
-{
-    File& file = *this;
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    file << section;
-    return file;
-}
-
-// PutMarker for section beginning and ending tags
-// section - [in]name of section
-File& File::PutMarker(FileMarker marker, const std::wstring& section)
-{
-    File& file = *this;
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    file << section;
-    return file;
-}
-
-// Get a zero terminated wstring from a file
-// val - value to read from the file
-File& File::operator>>(std::wstring& val)
-{
-    attempt([&]{
-        if (IsTextBased())
-            val = fgetwtoken(m_file);
-        else
-            val = fgetwstring(m_file);
-    });
-    return *this;
-}
-
-// Get a zero terminated string from a file
-// val - value to read from the file
-File& File::operator>>(std::string& val)
-{
-    attempt([&]{
-        if (IsTextBased())
-            val = fgettoken(m_file);
-        else
-            val = fgetstring(m_file);
-    });
-    return *this;
-}
-
-// ReadChars - read a specified number of characters, and reset read pointer if requested
-// val - [in,out] return value will be returned here
-// cnt - number of characters to read
-// reset - reset the read pointer
-void File::ReadChars(std::string& val, size_t cnt, bool reset)
-{
-    size_t pos = 0; // (initialize to keep compiler happy)
-    if (reset)
-        pos = GetPosition();
-    val.resize(cnt);
-    char *str = const_cast<char*>(val.c_str());
-    for (int i=0;i < cnt;++i)
-        *this >> str[i];
-    if (reset)
-        SetPosition(pos);
-}
-
-// ReadChars - read a specified number of characters, and reset read pointer if requested
-// val - [in,out] return value will be returned here
-// cnt - number of characters to read
-// reset - reset the read pointer
-void File::ReadChars(std::wstring& val, size_t cnt, bool reset)
-{
-    size_t pos = 0; // (initialize to keep compiler happy)
-    if (reset)
-        pos = GetPosition();
-    val.resize(cnt);
-    wchar_t *str = const_cast<wchar_t*>(val.c_str());
-    for (int i=0;i < cnt;++i)
-        *this >> str[i];
-    if (reset)
-        SetPosition(pos);
-}
-
-// WriteString - outputs a string into the file
-// str - the string to output
-// size - size of the string to output; if zero, null terminated
-void File::WriteString(const char* str, int size)
-{
-    attempt([&]{
-        if (size > 0)
-        {
-            fwprintf(m_file, L" %.*hs", size, str);
-        }
-        else
-        {
-            if (IsTextBased())
-                fwprintf(m_file, L" %hs", str);
-            else
-                fputstring (m_file, str);
-        }
-    });
-}
-
-// ReadString - reads a string from the file
-// str - the string buffer to read the string into
-// size - size of the string buffer
-void File::ReadString(char* str, int size)
-{
-    attempt([&]{
-        if (IsTextBased())
-            fgettoken(m_file, str, size);
-        else
-            fgetstring (m_file, str, size);
-    });
-}
-
-// WriteString - outputs a string into the file
-// if writing to a text-based file and spaces are embedded, writes quotes around the string
-// str - the string to output
-// size - size of the string to output; if zero, null terminated
-void File::WriteString(const wchar_t* str, int size)
-{
-    attempt([&]{
-#ifdef EMBEDDED_SPACES
-        // start of implementation of embedded space support with quoting
-        // not complete, not sure if we need it
-        bool spacefound = false;
-        wchar_t quote = 0;
-        if (IsTextBased())
-        {
-            // search for embedded spaces and quotes
-            wstring searchString = L" \"'~";
-            const wchar_t* result = NULL;
-            while (result = wcspbrk(str, searchString.c_str()))
-            {
-                if (IsWhiteSpace(*result))
-                    spacefound = true;
-                searchString.find(*result, 0);
-            }
-        }
-#endif
-        if (size > 0)
-        {
-            fwprintf(m_file, L" %.*ls", size, str);
-        }
-        else
-        {
-            if (IsTextBased())
-                fwprintf(m_file, L" %ls", str);
-            else
-                fputstring (m_file, str);
-        }
-    });
-}
-
-// ReadString - reads a string from the file
-// str - the string buffer to read the string into
-// size - size of the string buffer
-void File::ReadString(wchar_t* str, int size)
-{
-    attempt([&]{
-        if (IsTextBased())
-            fgettoken(m_file, str, size);
-        else
-            fgetstring (m_file, str, size);
-    });
-}
-
-// IsUnicodeBOM - are the next characters the Unicode Byte Order Mark?
-// skip - skip the BOM mark if found (defaults to false)
-// returns - true if on a unicode BOM
-bool File::IsUnicodeBOM(bool skip)
-{
-    File& file = *this;
-    uint64_t pos = GetPosition();
-    // if we aren't at the beginning of the file, it can't be the byte order mark
-    if (pos != 0)
-        return false;
-
-    // only exists for UNICODE files
-    bool found = false;
-    if (m_options & fileOptionsUnicode)
-    {
-        unsigned int bom=0;
-        if (IsTextBased())
-            ftrygetText(m_file, bom);
-        else
-            fget(m_file, bom);
-        // future: one reason for the BOM is to detect other-endian files, should we support?
-        found = (bom == 0xfeff);
-    }
-    else if (m_options & fileOptionsText)
-    {
-        char val[3];
-        file.ReadString(val, 3);
-        found = (val[0] == 0xEF && val[1] == 0xBB && val[2] == 0xBF);
-    }
-    // restore pointer if no BOM or we aren't skipping it
-    if (!found || !skip)
-    {
-        SetPosition(pos);
-    }
-    return found;
-}
-
-// Size - return the size of the file
-// WARNING: calling this will reset the EOF marker, so do so with care
-size_t File::Size()
-{
-    return filesize(m_file);
-}
-
-// IsEOF - if we have read past the end of the file
-// return - true if end of file has been found
-bool File::IsEOF()
-{
-    return !!feof(m_file);
-}
-
-// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
-// skip - skip the whitespace if found (defaults to false)
-// returns - true if whitespace found
-bool File::IsWhiteSpace(bool skip)
-{
-    bool spaceFound = false;
-    bool spaceCur = false;
-    if (m_options & fileOptionsUnicode)
-    {
-        wint_t c;
-        do
-        {
-            c = fgetwc (m_file);
-            if (c == WEOF) // hit the end
-                return spaceFound;
-            spaceCur = !!iswspace(c);
-            spaceFound = spaceFound || spaceCur;
-        } while (spaceCur && skip);
-        // put back the last character (WEOF is ignored)
-        ungetwc(c, m_file);
-    }
-    else
-    {
-        int c;
-        do
-        {
-            c = fgetc (m_file);
-            if (c == EOF) // hit the end
-                return spaceFound;
-            spaceCur = !!isspace(c);
-            spaceFound = spaceFound || spaceCur;
-        } while (spaceCur && skip);
-        // put back the last character (EOF is ignored)
-        ungetc(c, m_file);
-    }
-
-    return spaceFound;
-}
-
-// EndOfLineOrEOF - are the next characters an end-of-line sequence ('\r\n') possibly preceded by (space, \t)? EOF detected too
-// skip - skip the end of line if found (defaults to false)
-// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
-int File::EndOfLineOrEOF(bool skip)
-{
-    int found = false;
-    if (m_options & fileOptionsUnicode)
-        found = fskipwNewline(m_file,skip);
-    else if (m_options & fileOptionsText)
-        found = fskipNewline(m_file, skip);
-    return found;
-}
-
-
-// Get a marker from the file
-// some are ignored, others are expecting characters
-// must use GetMarker methods for those that require parameters
-File& File::operator>>(FileMarker marker)
-{
-    File& file = *this;
-
-    switch(marker)
-    {
-    case fileMarkerBeginFile: // beginning of file marker
-        // check for Unicode BOM marker
-        if (IsTextBased())
-            IsUnicodeBOM(true);
-        break;
-    case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
-        if (!IsEOF())
-            throw std::runtime_error("fileMarkerEndFile not found");
-        break;
-    case fileMarkerBeginList: // Beginning of list marker
-        // no marker written unless a list with a count header
-        break;
-    case fileMarkerListSeparator: // separate elements of a list
-        // do nothing for now, built-in space delimiter for all types (before type)
-        // future: make this customizable, so you can specify a separator (i.e. ',')
-        break;
-    case fileMarkerEndList: // end of line/list marker
-        if (IsTextBased())
-        {
-            int found = EndOfLineOrEOF(true);
-            if (found != (int)true) // EOF can also be returned
-                throw std::runtime_error("Newline not found");
-        }
-        break;
-    case fileMarkerBeginSection: // beginning of section
-    case fileMarkerEndSection: // end of section
-        assert(false); // sections should use a string modifier
-        break;
-    }
-    return file;
-}
-
-// Get a marker from the file
-// some are ignored, others are expecting characters
-// must use GetMarker methods for those that require parameters
-bool File::IsMarker(FileMarker marker, bool skip)
-{
-    bool retval = false;
-    switch(marker)
-    {
-    case fileMarkerBeginFile: // beginning of file marker
-        // check for Unicode BOM marker
-        retval = IsUnicodeBOM(skip);
-        break;
-    case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
-        retval = IsEOF();
-        break;
-    case fileMarkerBeginList: // Beginning of list marker
-        // no marker written unless a list with a count header
-        // should we try to validate BOL header (just know it's an int, not negative, etc.)
-        break;
-    case fileMarkerListSeparator: // separate elements of a list
-        // do nothing for now, built-in space delimiter for all types (before type)
-        // future: make this customizable, so you can specify a separator (i.e. ',')
-        break;
-    case fileMarkerEndList: // end of line/list marker
-        if (IsTextBased())
-        {
-            int eolSeen = false;
-            eolSeen = EndOfLineOrEOF(skip);
-            retval = (eolSeen == (int)true);
-        }
-        break;
-    case fileMarkerBeginSection: // beginning of section
-    case fileMarkerEndSection: // end of section
-        // can't distinguish from a string currently
-        break;
-    }
-    return retval;
-}
-
-
-// GetMarker for beginning of list support (lists with a count)
-// count - [out] returns the number of elements in the list
-File& File::GetMarker(FileMarker marker, size_t& count)
-{
-    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count file markers
-    // use text-based try, so it can fail without an exception
-    if (IsTextBased())
-        ftrygetText(m_file, count);
-    else
-        fget(m_file, count);
-    return *this;
-}
-
-// GetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-File& File::GetMarker(FileMarker marker, const std::string& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    string str;
-    *this >> str;
-    if (str != section)
-        throw std::runtime_error(std::string("section name mismatch ") + str + " != " + section);
-    return *this;
-}
-
-// GetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-File& File::GetMarker(FileMarker marker, const std::wstring& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    wstring str;
-    *this >> str;
-    if (str != section)
-        throw std::runtime_error(std::string("section name mismatch ") + msra::strfun::utf8(str) + " != " + msra::strfun::utf8(section));
-    return *this;
-}
-
-// TryGetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    size_t pos = GetPosition();
-    std::wstring str;
-    try
-    {
-        *this >> str;
-        if (str == section)
-            return true;
-    }
-    catch(...)
-    {
-        //eat
-    }
-    SetPosition(pos);
-    return false;
-}
-
-// TryGetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-bool File::TryGetMarker(FileMarker marker, const std::string& section)
-{
-    // only the section markers take a string parameter
-    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
-    size_t pos = GetPosition();
-    std::string str;
-    try
-    {
-        *this >> str;
-        if (str == section)
-            return true;
-    }
-    catch(...)
-    {
-        return false;
-    }
-    SetPosition(pos);
-    return false;
-}
-
-// GetPosition - Get position in a file
-uint64_t File::GetPosition()
-{
-    return fgetpos(m_file);
-}
-
-// Set the position in the file
-// pos - position in the file
-void File::SetPosition(uint64_t pos)
-{
-    fsetpos (m_file, pos);
-}
-
-}}}
+// +// + +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings +#endif + +#include "basetypes.h" +#define FORMAT_SPECIALIZE // to get the specialized version of the format routines +#include "fileutil.h" +#include "File.h" +#include +#include +#include +#ifdef _WIN32 +#include +#endif +#ifdef __unix__ +#include +#endif + +namespace Microsoft{ namespace MSR { namespace CNTK { + +// File creation +// filename - the path +// fileOptions - options to open the file +File::File(const std::wstring& filename, int fileOptions) +{ + Init(filename.c_str(), fileOptions); +} + +File::File(const std::string& filename, int fileOptions) +{ + // this converts from string to wstring, and then to wchar_t* + Init(msra::strfun::utf16(filename).c_str(), fileOptions); +} + +File::File(const wchar_t* filename, int fileOptions) +{ + Init(filename, fileOptions); +} + +void File::Init(const wchar_t* filename, int fileOptions) +{ + msra::files::make_intermediate_dirs(filename); + // translate the options string into a string for fopen() + wstring options = fileOptions&fileOptionsRead?L"r":L""; + if (fileOptions&fileOptionsWrite) + { + // if we already are reading the file, change to read/write + options.clear(); + options.append(L"w+"); + } + if (fileOptions&fileOptionsBinary) + { + options += L"b"; + } + else + { + if (fileOptions & fileOptionsUnicode) + options += L"b"; + else + options += L"t"; + // I attempted to use the translated characterset modes, but encountered strange errors + //options += L"t, ccs="; + //options += (fileOptions & fileOptionsUnicode)?L"UNICODE":L"UTF-8"; + } + // add sequential flag to allocate big read buffer + if (fileOptions & fileOptionsSequential) + options += L"S"; + + attempt([=](){m_file = fopenOrDie(filename, options.c_str());}); + m_options = fileOptions; + m_size = filesize(m_file); +} + +void File::goToDelimiter(int delim) +{ + int ch=0; + + while (ch!=delim) { + ch=fgetc(m_file); + if (feof(m_file)) { + printf("Unexpected end of file\n"); + throw std::logic_error("Unexpected end of file\n"); + } + } +} + +bool File::IsTextBased() +{ + return !!(m_options & (fileOptionsText|fileOptionsUnicode)); +} + +// File Destructor +// closes the file +File::~File(void) +{ + attempt([=] {fcloseOrDie(m_file);}); +} + +// GetLine - get a line from the file +// str - string to store the line +void File::GetLine(wstring& str) +{ + str = fgetlinew(m_file); +} + +// GetLine - get a line from the file +// str - string +void File::GetLine(string& str) +{ + str = fgetline(m_file); +} + +// Put a zero/space terminated wstring into a file +// val - value to write to the file +File& File::operator<<(const std::wstring& val) +{ + WriteString(val.c_str()); + return *this; +} + + +// Put a zero/space terminated string into a file +// val - value to write to the file +File& File::operator<<(const std::string& val) +{ + WriteString(val.c_str()); + return *this; +} + +// Put a marker in the file, the marker depends on the file type +// marker - marker to place in the file +File& File::operator<<(FileMarker marker) +{ + File& file = *this; + switch(marker) + { + case fileMarkerBeginFile: // beginning of file marker + // only exists for UNICODE files + if (m_options & fileOptionsUnicode) + file << (unsigned int)0xfeff; // byte order mark + break; + case fileMarkerEndFile: // end of file marker + // use ^Z for end of file for text files + if (m_options & 
fileOptionsUnicode) + file << wchar_t(26); // ^Z + else if (m_options & fileOptionsText) + file << char(26); + break; + case fileMarkerBeginList: // Beginning of list marker + // no marker written for either + break; + case fileMarkerListSeparator: // separate elements of a list + // do nothing for now, built in space deliminter for all types (before type) + // future: make this customizable, so you can specify a separator (i.e. ',') + break; + case fileMarkerEndList: // end of line/list marker + if (m_options & fileOptionsUnicode) + file.WriteString(L"\r\n"); // carriage return/life feed + else if (m_options & fileOptionsText) + file.WriteString("\r\n"); + break; + case fileMarkerBeginSection: // beginning of section + case fileMarkerEndSection: // end of section + assert(false); // sections should use a string modifier + break; + } + return file; +} + +// PutMarker for beginning of list support (lists with a count) +// count - [in] the number of elements in the list +File& File::PutMarker(FileMarker marker, size_t count) +{ + assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count markers + *this << count; + return *this; +} + +// PutMarker for section beginning and ending tags +// section - [in]name of section +File& File::PutMarker(FileMarker marker, const std::string& section) +{ + File& file = *this; + // only the section markers take a string parameter + assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker; + file << section; + return file; +} + +// PutMarker for section beginning and ending tags +// section - [in]name of section +File& File::PutMarker(FileMarker marker, const std::wstring& section) +{ + File& file = *this; + // only the section markers take a string parameter + assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker; + file << section; + return file; +} + +// Get a zero terminated wstring from a file +// val - value to read from the file +File& File::operator>>(std::wstring& val) +{ + attempt([&]{ + if (IsTextBased()) + val = fgetwtoken(m_file); + else + val = fgetwstring(m_file); + }); + return *this; +} + +// Get a zero terminated string from a file +// val - value to read from the file +File& File::operator>>(std::string& val) +{ + attempt([&]{ + if (IsTextBased()) + val = fgettoken(m_file); + else + val = fgetstring(m_file); + }); + return *this; +} + +// ReadChars - read a specified number of characters, and reset read pointer if requested +// val - [in,out] return value will be returned here +// cnt - number of characters to read +// reset - reset the read pointer +void File::ReadChars(std::string& val, size_t cnt, bool reset) +{ + size_t pos = 0; // (initialize to keep compiler happy) + if (reset) + pos = GetPosition(); + val.resize(cnt); + char *str = const_cast(val.c_str()); + for (int i=0;i < cnt;++i) + *this >> str[i]; + if (reset) + SetPosition(pos); +} + +// ReadChars - read a specified number of characters, and reset read pointer if requested +// val - [in,out] return value will be returned here +// cnt - number of characters to read +// reset - reset the read pointer +void File::ReadChars(std::wstring& val, size_t cnt, bool reset) +{ + size_t pos = 0; // (initialize to keep compiler happy) + if (reset) + pos = GetPosition(); + val.resize(cnt); + wchar_t *str = const_cast(val.c_str()); + for (int i=0;i < cnt;++i) + *this >> str[i]; + if (reset) + SetPosition(pos); +} + +// WriteString - outputs a string into the file +// str - the string to output +// size - 
size of the string to output, if zero null terminated +void File::WriteString(const char* str, int size) +{ + attempt([&]{ + if (size > 0) + { + fwprintf(m_file, L" %.*hs", size, str); + } + else + { + if (IsTextBased()) + fwprintf(m_file, L" %hs", str); + else + fputstring (m_file, str); + } + }); +} + +// ReadString - reads a string into the file +// str - the string buffer to read the string into +// size - size of the string string buffer +void File::ReadString(char* str, int size) +{ + attempt([&]{ + if (IsTextBased()) + fgettoken(m_file, str, size); + else + fgetstring (m_file, str, size); + }); +} + +// WriteString - outputs a string into the file +// if writing to text based file and spaces are embedded, writes quotes around string +// str - the string to output +// size - size of the string to output, if zero null terminated +void File::WriteString(const wchar_t* str, int size) +{ + attempt([&]{ +#ifdef EMBEDDED_SPACES + // start of implementation of embedded space support with quoting + // not complete, not sure if we need it + bool spacefound = false; + wchar_t quote = 0; + if (IsTextBased()) + { + // search for embedded spaces and quotes + wstring searchString = L" \"'~"; + const wchar_t* result = NULL; + while (result = wcspbrk(str, searchString.c_str())) + { + if (IsWhiteSpace(*result)) + spacefound = true; + searchString.find(*result, 0); + } + } +#endif + if (size > 0) + { + fwprintf(m_file, L" %.*ls", size, str); + } + else + { + if (IsTextBased()) + fwprintf(m_file, L" %ls", str); + else + fputstring (m_file, str); + } + }); +} + +// ReadString - reads a string into the file +// str - the string buffer to read the string into +// size - size of the string string buffer +void File::ReadString(wchar_t* str, int size) +{ + attempt([&]{ + if (IsTextBased()) + fgettoken(m_file, str, size); + else + fgetstring (m_file, str, size); + }); +} + +// IsUnicodeBOM - is the next characters the Unicode Byte Order Mark? +// skip - skip the BOM mark if found (defaults to false) +// returns - true if on a unicode BOM +bool File::IsUnicodeBOM(bool skip) +{ + File& file = *this; + uint64_t pos = GetPosition(); + // if we aren't at the beginning of the file, it can't be the byte order mark + if (pos != 0) + return false; + + // only exists for UNICODE files + bool found = false; + if (m_options & fileOptionsUnicode) + { + unsigned int bom=0; + if (IsTextBased()) + ftrygetText(m_file, bom); + else + fget(m_file, bom); + // future: one reason for the BOM is to detect other-endian files, should we support? + found = (bom == 0xfeff); + } + else if (m_options & fileOptionsText) + { + char val[3]; + file.ReadString(val, 3); + found = (val[0] == 0xEF && val[1] == 0xBB && val[2] == 0xBF); + } + // restore pointer if no BOM or we aren't skipping it + if (!found || !skip) + { + SetPosition(pos); + } + return found; +} + +//Size - return the size of the file +// WARNING: calling this will reset the EOF marker, so do so with care +size_t File::Size() +{ + return filesize(m_file); +} + +// IsEOF - if we have read past the end of the file +// return - true if end of file has been found +bool File::IsEOF() +{ + return !!feof(m_file); +} + +// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)? 
+// skip - skip the whitespace if found (defaults to false) +// returns - true if whitespace found +bool File::IsWhiteSpace(bool skip) +{ + bool spaceFound = false; + bool spaceCur = false; + if (m_options & fileOptionsUnicode) + { + wint_t c; + do + { + c = fgetwc (m_file); + if (c == WEOF) // hit the end + return spaceFound; + spaceCur = !!iswspace(c); + spaceFound = spaceFound || spaceCur; + } while (spaceCur && skip); + // put back the last character (WEOF is ignored) + ungetwc(c, m_file); + } + else + { + int c; + do + { + c = fgetc (m_file); + if (c == EOF) // hit the end + return spaceFound; + spaceCur = !!isspace(c); + spaceFound = spaceFound || spaceCur; + } while (spaceCur && skip); + // put back the last character (EOF is ignored) + ungetc(c, m_file); + } + + return spaceFound; +} + +// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceeded by (space, \t)? EOF detected too +// skip - skip the end of line if found (defaults to false) +// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped +int File::EndOfLineOrEOF(bool skip) +{ + int found = false; + if (m_options & fileOptionsUnicode) + found = fskipwNewline(m_file,skip); + else if (m_options & fileOptionsText) + found = fskipNewline(m_file, skip); + return found; +} + + +// Get a marker from the file +// some are ignored others are expecting characters +// must use GetMarker methods for those that require parameters +File& File::operator>>(FileMarker marker) +{ + File& file = *this; + + switch(marker) + { + case fileMarkerBeginFile: // beginning of file marker + // check for Unicode BOM marker + if (IsTextBased()) + IsUnicodeBOM(true); + break; + case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file? + if (!IsEOF()) + throw std::runtime_error("fileMarkerEndFile not found"); + break; + case fileMarkerBeginList: // Beginning of list marker + // no marker written unless an list with a count header + break; + case fileMarkerListSeparator: // separate elements of a list + // do nothing for now, built in space deliminter for all types (before type) + // future: make this customizable, so you can specify a separator (i.e. ',') + break; + case fileMarkerEndList: // end of line/list marker + if (IsTextBased()) + { + int found = EndOfLineOrEOF(true); + if (found != (int)true) // EOF can also be returned + throw std::runtime_error("Newline not found"); + } + break; + case fileMarkerBeginSection: // beginning of section + case fileMarkerEndSection: // end of section + assert(false); // sections should use a string modifier + break; + } + return file; +} + +// Get a marker from the file +// some are ignored others are expecting characters +// must use GetMarker methods for those that require parameters +bool File::IsMarker(FileMarker marker, bool skip) +{ + bool retval = false; + switch(marker) + { + case fileMarkerBeginFile: // beginning of file marker + // check for Unicode BOM marker + retval = IsUnicodeBOM(skip); + break; + case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file? + retval = IsEOF(); + break; + case fileMarkerBeginList: // Beginning of list marker + // no marker written unless an list with a count header + // should we try to validate BOL header (just know it's an int, not negative, etc.) 
+        break;
+    case fileMarkerListSeparator: // separate elements of a list
+        // do nothing for now, built-in space delimiter for all types (before type)
+        // future: make this customizable, so you can specify a separator (i.e. ',')
+        break;
+    case fileMarkerEndList: // end of line/list marker
+        if (IsTextBased())
+        {
+            int eolSeen = false;
+            eolSeen = EndOfLineOrEOF(skip);
+            retval = (eolSeen == (int)true);
+        }
+        break;
+    case fileMarkerBeginSection: // beginning of section
+    case fileMarkerEndSection: // end of section
+        // can't distinguish from a string currently
+        break;
+    }
+    return retval;
+}
+
+
+// GetMarker for beginning-of-list support (lists with a count)
+// count - [out] returns the number of elements in the list
+File& File::GetMarker(FileMarker marker, size_t& count)
+{
+    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count file markers
+    // use the text-based try, so it can fail without an exception
+    if (IsTextBased())
+        ftrygetText(m_file, count);
+    else
+        fget(m_file, count);
+    return *this;
+}
+
+// GetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+File& File::GetMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    string str;
+    *this >> str;
+    if (str != section)
+        throw std::runtime_error(std::string("section name mismatch ") + str + " != " + section);
+    return *this;
+}
+
+// GetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+File& File::GetMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    wstring str;
+    *this >> str;
+    if (str != section)
+        throw std::runtime_error(std::string("section name mismatch ") + msra::strfun::utf8(str) + " != " + msra::strfun::utf8(section));
+    return *this;
+}
+
+// TryGetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    size_t pos = GetPosition();
+    std::wstring str;
+    try
+    {
+        *this >> str;
+        if (str == section)
+            return true;
+    }
+    catch(...)
+    {
+        // eat the exception and fall through to restore the position
+    }
+    SetPosition(pos);
+    return false;
+}
+
+// TryGetMarker for section beginning and ending tags
+// section - [in] name of the section that is expected
+bool File::TryGetMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    size_t pos = GetPosition();
+    std::string str;
+    try
+    {
+        *this >> str;
+        if (str == section)
+            return true;
+    }
+    catch(...)
+    {
+        // eat the exception and fall through, so the position is restored as in the wstring overload
+    }
+    SetPosition(pos);
+    return false;
+}
+
+// GetPosition - get the position in the file
+uint64_t File::GetPosition()
+{
+    return fgetpos(m_file);
+}
+
+// SetPosition - set the position in the file
+// pos - position in the file
+void File::SetPosition(uint64_t pos)
+{
+    fsetpos (m_file, pos);
+}
+
+}}}
diff --git a/Common/Include/TimerUtility.h b/Common/Include/TimerUtility.h
new file mode 100644
index 000000000..c964f4282
--- /dev/null
+++ b/Common/Include/TimerUtility.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#define MS_PER_SEC 1000
+
+namespace Microsoft{namespace MSR {namespace CNTK {
+    class Timer
+    {
+    public:
+        Timer(){};
+        ~Timer(){};
+        static unsigned long long MilliSecondElapsed();
+    };
+}}}
diff --git a/Common/TimerUtility.cpp b/Common/TimerUtility.cpp
new file mode 100644
index 000000000..f0fe29342
--- /dev/null
+++ b/Common/TimerUtility.cpp
@@ -0,0 +1,39 @@
+#include "TimerUtility.h"
+
+#ifdef WIN32
+#include <Windows.h>
+#else
+#include <time.h>
+#endif
+namespace Microsoft{
+    namespace MSR {
+        namespace CNTK {
+
+            // Returns the number of milliseconds elapsed since the Unix epoch
+            unsigned long long Timer::MilliSecondElapsed()
+            {
+#ifdef WIN32
+                FILETIME ft;
+                LARGE_INTEGER li;
+
+                GetSystemTimeAsFileTime(&ft); // ideally we should use GetSystemTimePreciseAsFileTime, but it's only available on Win8+ and Win Server 2012+
+                li.LowPart = ft.dwLowDateTime;
+                li.HighPart = ft.dwHighDateTime;
+
+                unsigned long long ret = li.QuadPart;
+                ret -= 116444736000000000LL; // shift the epoch from 1601-01-01 (FILETIME) to 1970-01-01, to make the values consistent with Linux
+                ret /= 10000;                // from 100-nanosecond ticks (10^-7 s) to 1 millisecond (10^-3 s)
+
+                return ret;
+#else
+                timespec ts;
+                clock_gettime(CLOCK_REALTIME, &ts); // works on Linux
+
+                unsigned long long ret = ts.tv_sec * 1000ULL + ts.tv_nsec / 1000000; // use a portable type here; UINT64 is Windows-only
+
+                return ret;
+#endif
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Common/fileutil.cpp b/Common/fileutil.cpp
index 4b2e3c565..9f6b6b134 100644
--- a/Common/fileutil.cpp
+++ b/Common/fileutil.cpp
@@ -4,7 +4,10 @@
 //
 //
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
 #define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
 #pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
 #define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux
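The 116444736000000000LL constant in Timer::MilliSecondElapsed above is the distance between the FILETIME epoch (1601-01-01) and the Unix epoch (1970-01-01), expressed in 100-nanosecond ticks. A standalone sanity check of that constant (my own sketch, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // 1601-01-01 .. 1970-01-01 spans 369 years containing 89 leap days
        // (92 multiples of 4, minus the non-leap centuries 1700, 1800, 1900).
        const uint64_t days    = 369ULL * 365 + 89;     // 134774
        const uint64_t seconds = days * 24 * 60 * 60;   // 11644473600
        const uint64_t ticks   = seconds * 10000000ULL; // 100 ns FILETIME ticks
        printf("%llu\n", (unsigned long long) ticks);   // 116444736000000000
        return ticks == 116444736000000000ULL ? 0 : 1;
    }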
diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp
index ebb659525..2269d8779 100644
--- a/DataReader/HTKMLFReader/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp
@@ -49,17 +49,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_truncated = readerConfig("Truncated", "false");
         m_convertLabelsToTargets = false;
-        m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
+        ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
+        m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
 
-        if (m_numberOfuttsPerMinibatch < 1)
+        for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
         {
-            LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+            m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
+            if (m_numberOfuttsPerMinibatch < 1)
+            {
+                LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+            }
+
+            if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
+            {
+                LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
+            }
         }
-        if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
-        {
-            LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
-        }
+        m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
 
         m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
         m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // get the read method, defaults to "blockRandomize" other option is "rollingWindow"
         std::string readMethod(readerConfig("readMethod","blockRandomize"));
 
+        if (readMethod == "blockRandomize" && randomize == randomizeNone)
+        {
+            fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Changing it to Auto.\n");
+            randomize = randomizeAuto;
+        }
+
         // see if they want to use readAhead
         m_readAhead = readerConfig("readAhead", "false");
@@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // now get the frame source. This has better randomization and doesn't create temp files
         m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
+        m_frameSource->setverbosity(verbosity);
         //m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
 
 }
@@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         m_mbSize = mbSize;
 
+        m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
+
+        m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
+        m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
+        m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
+        m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
+        m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
+
         if (m_trainOrTest)
         {
             StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);
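The hunks above turn nbruttsineachrecurrentiter into an array with one entry per epoch, validate every entry up front, and then have StartMinibatchLoop (last hunk) index it with the epoch number. A minimal sketch of that per-epoch lookup, assuming the argvector-style convention that a schedule shorter than the number of epochs keeps repeating its last entry; the helper name and the explicit clamp are mine, not from the patch:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for m_numberOfuttsPerMinibatchForAllEpochs[epoch]:
    // once the schedule runs out, the last entry keeps applying.
    static size_t UttsForEpoch(const std::vector<size_t>& perEpoch, size_t epoch)
    {
        return perEpoch[std::min(epoch, perEpoch.size() - 1)];
    }

    int main()
    {
        const std::vector<size_t> schedule = { 1, 2, 4 }; // e.g. nbruttsineachrecurrentiter=1:2:4
        for (size_t epoch = 0; epoch < 5; epoch++)
            printf("epoch %zu -> %zu utterances\n", epoch, UttsForEpoch(schedule, epoch));
        return 0; // prints 1, 2, 4, 4, 4
    }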
diff --git a/DataReader/HTKMLFReader/HTKMLFReader.h b/DataReader/HTKMLFReader/HTKMLFReader.h
index 3b7692f4b..a4e90da3d 100644
--- a/DataReader/HTKMLFReader/HTKMLFReader.h
+++ b/DataReader/HTKMLFReader/HTKMLFReader.h
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 //
 //
 // Copyright (c) Microsoft Corporation. All rights reserved.
@@ -111,4 +112,117 @@ public:
     void SetSentenceEnd(int /*actualMbSize*/){};
 };
+=======
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
+#pragma once
+#include "DataReader.h"
+#include "commandArgUtil.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template<class ElemType>
+class HTKMLFReader : public IDataReader<ElemType>
+{
+private:
+    msra::dbn::minibatchiterator* m_mbiter;
+    msra::dbn::minibatchsource* m_frameSource;
+    msra::dbn::minibatchreadaheadsource* m_readAheadSource;
+    msra::dbn::FileEvalSource* m_fileEvalSource;
+    msra::dbn::latticesource* m_lattices;
+    map<wstring, msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
+
+    vector<bool> m_sentenceEnd;
+    bool m_readAhead;
+    bool m_truncated;
+    vector<size_t> m_processedFrame;
+    intargvector m_numberOfuttsPerMinibatchForAllEpochs;
+    size_t m_numberOfuttsPerMinibatch;
+    size_t m_actualnumberOfuttsPerMinibatch;
+    size_t m_mbSize;
+    vector<size_t> m_toProcess;
+    vector<size_t> m_switchFrame;
+    bool m_noData;
+
+    bool m_trainOrTest; // if false, in file writing mode
+
+    std::map<LabelIdType, LabelType> m_idToLabelMap;
+
+    bool m_partialMinibatch; // allow partial minibatches?
+
+    std::vector<ElemType*> m_featuresBufferMultiUtt;
+    std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
+    std::vector<ElemType*> m_labelsBufferMultiUtt;
+    std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
+    std::vector<size_t> m_featuresStartIndexMultiUtt;
+    std::vector<size_t> m_labelsStartIndexMultiUtt;
+
+    std::vector<ElemType*> m_featuresBufferMultiIO;
+    std::vector<size_t> m_featuresBufferAllocatedMultiIO;
+    std::vector<ElemType*> m_labelsBufferMultiIO;
+    std::vector<size_t> m_labelsBufferAllocatedMultiIO;
+
+    std::map<std::wstring, size_t> m_featureNameToIdMap;
+    std::map<std::wstring, size_t> m_labelNameToIdMap;
+    std::map<std::wstring, size_t> m_nameToTypeMap;
+    std::map<std::wstring, size_t> m_featureNameToDimMap;
+    std::map<std::wstring, size_t> m_labelNameToDimMap;
+    // for writing outputs to files (standard single input/output network) - deprecate eventually
+    bool m_checkDictionaryKeys;
+    bool m_convertLabelsToTargets;
+    std::vector<bool> m_convertLabelsToTargetsMultiIO;
+    std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
+
+    size_t m_inputFileIndex;
+    std::vector<size_t> m_featDims;
+    std::vector<size_t> m_labelDims;
+
+    std::vector<std::vector<std::vector<ElemType>>> m_labelToTargetMapMultiIO;
+
+    void PrepareForTrainingOrTesting(const ConfigParameters& config);
+    void PrepareForWriting(const ConfigParameters& config);
+
+    bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+    bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+
+    void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+    void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+
+    bool ReNewBufferForMultiIO(size_t i);
+
+    size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch; }
+    void SetNbrSlicesEachRecurrentIter(const size_t) { };
+
+    void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
+
+
+    size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
+    enum InputOutputTypes
+    {
+        real,
+        category,
+    };
+
+
+
+public:
+    virtual void Init(const ConfigParameters& config);
+    virtual void Destroy() {delete this;}
+    virtual ~HTKMLFReader();
+    virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+    virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+    virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
+    virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
+    virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
+
+    virtual bool 
DataEnd(EndDataType endDataType); + void SetSentenceEndInBatch(vector &/*sentenceEnd*/); + void SetSentenceEnd(int /*actualMbSize*/){}; +}; + +>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d }}} \ No newline at end of file diff --git a/DataReader/HTKMLFReader/rollingwindowsource.h b/DataReader/HTKMLFReader/rollingwindowsource.h index a3babcb13..7d5e253cc 100644 --- a/DataReader/HTKMLFReader/rollingwindowsource.h +++ b/DataReader/HTKMLFReader/rollingwindowsource.h @@ -1,817 +1,817 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// rollingwindowsource.h -- implementation of a rolling-window minibatch source ('minibatchframesource') with a disk page file -// - -#pragma once - -#include "basetypes.h" // for attempt() -#include "numahelpers.h" // for NUMA allocation -#include "minibatchsourcehelpers.h" -#include "minibatchiterator.h" -#include "biggrowablevectors.h" -#include "ssematrix.h" - -namespace msra { namespace dbn { - - // --------------------------------------------------------------------------- - // biggrowablevectorarray -- a big array of vectors for features, growable (push_back) - // Data is striped across NUMA nodes, as to not clog them up. - // This also supports paging to disk, which is used for the old minibatchframesource. - // --------------------------------------------------------------------------- - class biggrowablevectorarray : public growablevectorbase - { - size_t m; // dim - - size_t inmembegin; // range we have in memory, rounded to enclosing blocks (not rounded at end) - size_t inmemend; - - wstring pagepath; // path for paging, empty if no paging - auto_file_ptr f; // file handle for paging - bool reading; // have we begun reading? - - // allocate a block - msra::dbn::matrix * newblock() const - { - // we stripe the data across NUMA nodes as to not fill up one node with the feature data - msra::numa::overridenode ((int) msra::numa::getmostspaciousnumanode()); - msra::dbn::matrix * res = new msra::dbn::matrix (m, elementsperblock); - msra::numa::overridenode (-1); // note: we really should reset it also in case of failure - return res; - } - - // handling of page file - bool paging() const { return !pagepath.empty(); } - void openpagefile (bool wantread) - { - if (!paging()) return; - msra::files::make_intermediate_dirs (pagepath); - - if (!wantread) - { - FILE *ftry = NULL; - wstring pathname (pagepath); - ftry = _wfopen (pathname.c_str(), L"wbS"); - if (ftry) fclose (ftry); - } - - /* - code below to cycle through a-z appended to file name is no longer necessary - since caller guarantees unique file names via HTKMLFReader - and we want the pagepath logged to the user to be the actual one used by the code - - // try to open the pagepath from a to z - if (!wantread) - { - FILE *ftry = NULL; - char trynum = 'a'; - while (!ftry && trynum <= 'z') - { - wstring pathname (pagepath); - pathname += trynum++; - ftry = _wfopen (pathname.c_str(), L"wbS"); - } - if (ftry) fclose (ftry); - pagepath += --trynum; - } - */ - f = fopenOrDie (pagepath, wantread ? 
L"rbS" : L"wbS"); - reading = wantread; - } - void flushlastblock() // during population phase, must be called once per block in sequence - { - if (!paging()) return; - assert (!reading); - if (blocks.empty()) return; - const size_t blockid = blocks.size() -1; - msra::dbn::matrix & block = *blocks[blockid]; - assert (fgetpos (f) == blockid * block.sizeinpagefile()); - block.topagefile (f); - blocks[blockid].reset(); // free the memory - assert (blockid * elementsperblock == inmembegin); - inmembegin = inmemend; // empty range - } - void releaseblock (size_t t0) // t0=block start time - { - assert (paging() && reading); - size_t blockid = t0 / elementsperblock; - assert (blockid * elementsperblock == t0); - assert (blocks[blockid]); - fprintf (stderr, "recoverblock: releasing feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); - blocks[blockid].reset(); // free the memory - } - void recoverblock (size_t t0) // t0=block start time - { - assert (paging() && reading); - size_t blockid = t0 / elementsperblock; - assert (blockid * elementsperblock == t0); - assert (!blocks[blockid]); - fprintf (stderr, "recoverblock: recovering feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); - blocks[blockid].reset (newblock()); - msra::dbn::matrix & block = *blocks[blockid]; - fsetpos (f, blockid * block.sizeinpagefile()); - block.frompagefile (f); - } - - public: - biggrowablevectorarray (const wstring & pagepath) - : growablevectorbase (65536), m (0), - inmembegin (0), inmemend (0), pagepath (pagepath), reading (false) - { - openpagefile (false); - if (paging()) - fprintf (stderr, "biggrowablevectorarray: creating disk backup store at '%S'\n", pagepath.c_str()); - } - ~biggrowablevectorarray() { // clean up the big temp file - if (paging()) { - fclose (f); - if (_wunlink (pagepath.c_str())==0) - fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); - else - fprintf (stderr, "biggrowablevectorarray: unable to delete disk backup store at '%S'\n", pagepath.c_str()); - } - } - - size_t dim() const { return m; } // dimension of a frame - - // reading phase - void push_back (const std::vector & in) - { - assert (!in.empty()); - assert (m == 0 || m == in.size()); - m = in.size(); - const size_t blockid = n / elementsperblock; - assert (blockid <= blocks.size()); - if (blockid == blocks.size()) // a new block is needed - { - flushlastblock(); - blocks.push_back (std::unique_ptr (newblock())); - } - const size_t blockn = n % elementsperblock; - msra::dbn::matrix & block = *blocks[blockid].get(); - foreach_index (k, in) - block(k,blockn) = in[k]; - n++; - inmemend = n; - } - void no_more_push_back() // done pushing --switch to consumption mode - { - if (!paging()) return; - // finish off last block - flushlastblock(); - fflushOrDie (f); - fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f)); - fclose (f); - foreach_index (i, blocks) assert (!blocks[i]); // ensure we flushed - assert (inmembegin == inmemend); // nothing in cache - // switch to reading mode - openpagefile (true); - } - - // access phase - // Returns 'true' if data was actually read from disk. 
- bool require (pair bounds) // we require this range of frames - { - bool readfromdisk = false; - - // get bounds rounded to block boundaries - const size_t ts = bounds.first / elementsperblock * elementsperblock; - const size_t te = min (n, (bounds.second + elementsperblock -1) / elementsperblock * elementsperblock); - assert (paging()); - // free all the memmory - for (size_t t = inmembegin; t < inmemend; t += elementsperblock) - { - if (t >= ts && t < te) // if in wanted range then skip to end of it - t = te - elementsperblock; - else - releaseblock (t); - } - // page in all required blocks - for (size_t t = ts; t < te; t += elementsperblock) - { - if (t >= inmembegin && t < inmemend) // if in memory already then skip to end of it - t = inmemend - elementsperblock; - else - { - recoverblock (t); - readfromdisk = true; // tell caller we did something expensive - } - } - // got it - inmembegin = ts; - inmemend = te; - return readfromdisk; - } - const msra::dbn::matrixstripe operator[] (size_t t) const // get a feature vector - { - if (t < inmembegin || t >= inmemend) - throw std::logic_error ("biggrowablevectorarray: attempt to access vector without requesting to page it in first"); - const size_t blockt = getblockt (t); - /*const*/ msra::dbn::matrix & block = getblock (t); - return msra::dbn::matrixstripe (block, blockt, 1); - } - wstring pagepathname(){ return pagepath;} - void cleanuppagefile() - { - if (paging()) { - fclose (f); - if (_wunlink (pagepath.c_str())==0){ - fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); - } - else{ - fprintf (stderr, "biggrowablevectorarray: could NOT delete disk backup store at '%S'\n", pagepath.c_str()); - } - } - } - }; - - // --------------------------------------------------------------------------- - // minibatchframesource -- feature source to provide randomized frames in minibatches - // This is the old code that pages all frames to a huge disk file first. - // (The new minibatchutterancesource pages from input files directly and can also - // operate in utterance mode for MMI training.) - // --------------------------------------------------------------------------- - class minibatchframesource : public minibatchsource - { - size_t vdim; // feature dimension after augmenting neighhors (0: don't read features) - unsigned int sampperiod; // (for reference and to check against model) - string featkind; - size_t featdim; - // cache - biggrowablevectorarray frames; // [t][i] all features concatenated - std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors()) - std::vector classids; // [t] the state that the frame belongs to - size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels - msra::dbn::randomordering randomordering; // [t] -> t' - double timegetbatch; - int verbosity; - public: - // constructor - // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). 
- minibatchframesource (const std::vector & infiles, const map> & labels, - size_t vdim, size_t udim, size_t randomizationrange, const wstring & pagepath, const bool mayhavenoframe=false, int addEnergy=0) - : vdim (vdim), sampperiod (0), featdim (0), numframes (0), frames (pagepath), timegetbatch (0), verbosity(2) - { - if (vdim == 0 && labels.empty()) - throw runtime_error ("minibatchframesource: when running without features, labels are needed"); - // at this stage, we simply page in the entire training set at once and work off RAM - // We will benefit from feature archives indirectly through htkfeatio. - // TODO: - // - infiles must specify time range - // - at this stage only reserve() (we know the time range; allocate second-layer structure) - // - implement block-wise paging directly from HTK feature files through htkfeatreader - featkind.clear(); - std::vector frame; - fprintf (stderr, "minibatchframesource: reading %d utterances..", infiles.size()); - size_t numclasses = 0; // number of units found (actually max id +1) - size_t notfound = 0; // number of entries missing in MLF - msra::asr::htkfeatreader reader; // feature reader - reader.AddEnergy(addEnergy); - - foreach_index (i, infiles) - { - if (i % (infiles.size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } - msra::basetypes::matrix feat; - msra::asr::htkfeatreader::parsedpath ppath (infiles[i]); - - // skip files for which labels don't exist (assuming bad alignment) - wstring key; - if (!labels.empty()) // empty means unsupervised mode (don't load any) - { - key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) - if (labels.find (key) == labels.end()) - { - if (notfound < 5) - fprintf (stderr, "\nminibatchframesource: %d-th file not found in MLF label set: %S", i, key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - - // get feature frames - if (vdim != 0) // (vdim == special mode to not read features at all) - { - msra::util::attempt (5, [&]() - { - reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors - }); - if (featdim == 0) // first time - featdim = feat.rows(); - else if (featdim != feat.rows()) - throw std::runtime_error ("minibatchframesource: inconsistent feature dimension across files"); - // HVite occasionally generates mismatching output --skip such files - if (!key.empty()) // (we have a key if supervised mode) - { - const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) - size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); - if (abs ((int) labframes - (int) feat.cols()) > 0) - { - fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - // append to cache - frame.resize (featdim); - if (feat.cols() < 2) // (2 frames needed for boundary markers) - throw std::runtime_error ("minibatchframesource: utterances < 2 frames not supported"); - foreach_column (t, feat) - { - foreach_index (k, frame) - frame[k] = feat(k,t); - frames.push_back (frame); - numframes++; - boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? 
+1 : 0); - } - assert (numframes == frames.size()); - assert (numframes == boundaryflags.size()); - } - - // get label sequence - if (!key.empty()) // (we have a key if supervised mode) - { - const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) - foreach_index (i, labseq) - { - const auto & e = labseq[i]; - if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: labels not in consecutive order MLF in label set: %S", key.c_str())); - for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) - { - if (e.classid >= udim) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: class id exceeds model dimension in file %S", key.c_str())); - if (e.classid != (CLASSIDTYPE) e.classid) - throw std::runtime_error ("CLASSIDTYPE has too few bits"); - classids.push_back ((CLASSIDTYPE) e.classid); - numclasses = max (numclasses, 1u + e.classid); - } - } - if (vdim == 0) - numframes = classids.size(); - if (numframes != classids.size()) // TODO: remove this once we are confident - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); - assert (numframes == classids.size()); - } - else - { - assert (classids.empty()); // that's how we detect it later - } - } - assert (vdim == 0 || numframes == frames.size()); - assert (labels.empty() || numframes == classids.size()); - if ((vdim != 0 && numframes != frames.size()) || (!labels.empty() && numframes != classids.size())) - throw std::runtime_error ("minibatchframesource: numframes variable screwup"); - fprintf (stderr, " %d frames read from %d utterances; %d classes\n", numframes, infiles.size(), numclasses); - if (notfound > 0) - { - fprintf (stderr, "minibatchframesource: %d files out of %d not found in label set\n", notfound, infiles.size()); - if (notfound > infiles.size() / 2) - throw std::runtime_error ("minibatchframesource: too many files not found in label set--assuming broken configuration\n"); - } - - if (numframes == 0 && !mayhavenoframe) - throw std::runtime_error ("minibatchframesource: no input features given!"); - - // notify frames source to switch from population to consumption mode - frames.no_more_push_back(); - - // initialize randomizer - if (numframes > 0) - randomordering.resize (numframes, randomizationrange); - } - virtual ~minibatchframesource() {} - size_t totalframes() const { assert (vdim == 0 || numframes == frames.size()); assert (!issupervised() || numframes == classids.size()); return numframes; } - - bool issupervised() const { return !classids.empty(); } - - void setverbosity(int newverbosity) { verbosity = newverbosity; } - - // retrieve one minibatch - // Minibatches are deterministic pseudo-random samples. The entire corpus - // is repeated infinitely, but each repetition (a 'sweep') is randomized - // differently. - // This function allows to retrieve a mini-batch starting from any frame - // within this infinitely extended repetition. To the end, mini-batches are - // specified by start frame and #frames. - // This function returns the same data independent on #frames, i.e. the concept - // of the mini-batch is not defined in here, but on the caller side. The caller - // can retrieve the frames of a mini-batch in chunks that do not match the - // caller's definition of "mini-batch," e.g. bigger or smaller chunks. 
- // If a requested mini-batch spans a sweep boundary, then this function will - // not return samples after the sweep boundary. Instead, the returned frame - // set is shortened to not exceed the end of the sweep. The caller must make - // a separate second call to get the rest. In trainlayer(), the one - // sweep-boundary-spanning mini-batch will simply be shortened. - // This function is NOT thread-safe (due to caching of random sequence). - bool getbatch (const size_t globalts, const size_t framesrequested, msra::dbn::matrix & feat, std::vector & uids, - std::vector> & transcripts, - std::vector> & latticepairs) - { - auto_timer timergetbatch; - - transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) - latticepairs.clear(); // neither are lattices - - assert (totalframes() > 0); - const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) - const size_t ts = globalts % totalframes(); // start frame within the sweep - const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary - assert (te > ts); - if (verbosity >= 2) - fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); - - // get random sequence (each time index occurs exactly once) - // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. - const auto & tmap = randomordering (sweep); - - // page in the needed range of frames - const size_t extent = augmentationextent (frames.dim(), vdim); - bool readfromdisk = frames.require (randomordering.bounds (max (ts, extent) - extent, te + 1 + extent)); - - // generate features and uids - feat.resize (vdim, te - ts); // note: special mode vdim == 0 means no features to be loaded - if (issupervised()) // empty means unsupervised training -> return empty uids - uids.resize (te - ts); - else - uids.clear(); - for (size_t t = ts; t < te; t++) - { - size_t trand = tmap[t]; // the random-sequence sample point for this point in time - if (vdim != 0) - { - auto v_t = feat.col(t-ts); // the vector to fill in - augmentneighbors (frames, boundaryflags, trand, v_t); - } - if (issupervised()) - uids[t-ts] = classids[trand]; - } - timegetbatch = timergetbatch; - return readfromdisk; - } - - bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, - std::vector> & transcripts, - std::vector> & latticepairs) - { - // for single input/output set size to be 1 and run old getbatch - feat.resize(1); - uids.resize(1); - //transcripts.resize(1); - //latticepairs.resize(1); - return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs); - } - - double gettimegetbatch () { return timegetbatch;} - - // return first valid globalts to ask getbatch() for - // In frame mode, there is no constraint, i.e. it is 'globalts' itself. 
- /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; } - - /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); static std::vector x; return x;/*keep compiler happy*/ } - }; - - // --------------------------------------------------------------------------- - // minibatchframesourcemulti -- feature source to provide randomized frames in minibatches - // this is derived from minibatchframesource but worked with multiple inputs and/or outputs - // by making "frames" and "classids" a vector of vectors - // --------------------------------------------------------------------------- - class minibatchframesourcemulti : public minibatchsource - { - std::vector vdim; // feature dimension after augmenting neighhors (0: don't read features) - std::vector leftcontext; // number of frames to the left of the target frame in the context window - std::vector rightcontext; // number of frames to the right of the target frame in the context window - unsigned int sampperiod; // (for reference and to check against model) - string featkind; - size_t featdim; - size_t maxvdim; - // cache - //std::vector frames; - std::vector> pframes; // [t][i] all features concatenated - std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors()) - std::vector> classids; // [t] the state that the frame belongs to - size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels - msra::dbn::randomordering randomordering; // [t] -> t' - double timegetbatch; - int verbosity; - - public: - // constructor - // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). - minibatchframesourcemulti (const std::vector> & infiles, const std::vector>> & labels, - std::vector vdim, std::vector udim, std::vector leftcontext, std::vector rightcontext, size_t randomizationrange, const std::vector & pagepath, const bool mayhavenoframe=false, int addEnergy=0) - : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), numframes (0), timegetbatch (0), verbosity(2), maxvdim(0) - { - - if (vdim[0] == 0 && labels.empty()) - throw runtime_error ("minibatchframesourcemulti: when running without features, labels are needed"); - // at this stage, we simply page in the entire training set at once and work off RAM - // We will benefit from feature archives indirectly through htkfeatio. 
- // TODO: - // - infiles must specify time range - // - at this stage only reserve() (we know the time range; allocate second-layer structure) - // - implement block-wise paging directly from HTK feature files through htkfeatreader - featkind.clear(); - std::vector frame; - std::vectornumclasses; // number of units found (actually max id +1) - size_t notfound = 0; // number of entries missing in MLF - - - std::vectorframesaccum; - - if (infiles.size()==0) - throw runtime_error("minibatchframesourcemulti: need at least one network input specified with features"); - - if (labels.size()==0) - fprintf(stderr,"no MLF label files detected\n"); - - foreach_index (i, infiles) - { - pframes.push_back(unique_ptr(new biggrowablevectorarray(pagepath[i]))); - - if (vdim[i]>maxvdim) - maxvdim=vdim[i]; - } - - - foreach_index (i, labels) - { - classids.push_back(std::vector()); - numclasses.push_back(0); - } - - - fprintf (stderr, "minibatchframesourcemulti: reading %d feature sets and %d label sets...", infiles.size(),labels.size()); - - foreach_index (m, infiles) - { - - - featdim=0; - numframes=0; - featkind.clear(); - msra::asr::htkfeatreader reader; // feature reader - reader.AddEnergy(addEnergy); - - foreach_index (i, infiles[m]) // read each feature file in set m - { - if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } - msra::basetypes::matrix feat; - msra::asr::htkfeatreader::parsedpath ppath (infiles[m][i]); - - // skip files for which labels don't exist (assuming bad alignment) - wstring key; - if (!labels.empty()) - { - if (!labels[0].empty()) // empty means unsupervised mode (don't load any) - { - key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) - if (labels[0].find (key) == labels[0].end()) - { - if (notfound < 5) - fprintf (stderr, "\nminibatchframesourcemulti: %d-th file not found in MLF label set: %S", i, key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - } - // get feature frames - if (vdim[m] != 0) // (vdim == special mode to not read features at all) - { - msra::util::attempt (5, [&]() - { - reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors - }); - if (featdim == 0) // first time - featdim = feat.rows(); - else if (featdim != feat.rows()) - throw std::runtime_error ("minibatchframesourcemulti: inconsistent feature dimension across files"); - // HVite occasionally generates mismatching output --skip such files - if (!key.empty()) // (we have a key if supervised mode) - { - const auto & labseq = labels[0].find (key)->second; // (we already checked above that it exists) - size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); - if (abs ((int) labframes - (int) feat.cols()) > 0) - { - fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str()); - notfound++; - continue; // skip this utterance at all - } - } - // append to cache - frame.resize (featdim); - if (feat.cols() < 2) // (2 frames needed for boundary markers) - throw std::runtime_error ("minibatchframesourcemulti: utterances < 2 frames not supported"); - foreach_column (t, feat) - { - foreach_index (k, frame) - frame[k] = feat(k,t); - - pframes[m]->push_back (frame); - numframes++; - if (m==0) - boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? 
+1 : 0); - } - if (m==0) - framesaccum.push_back(numframes); - else - assert(numframes == framesaccum[i]); - - assert (numframes == pframes[m]->size()); - } - if (m==0) - assert (numframes == boundaryflags.size()); - - - - if (m==0) // after we get the key for this file, read all labels (only done for first feature) - { - if (!key.empty()) - { - foreach_index (j, labels) - { - const auto & labseq = labels[j].find (key)->second; // (we already checked above that it exists) - foreach_index (i, labseq) - { - const auto & e = labseq[i]; - if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: labels not in consecutive order MLF in label set: %S", key.c_str())); - for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) - { - if (e.classid >= udim[j]) - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: class id exceeds model dimension in file %S", key.c_str())); - if (e.classid != (CLASSIDTYPE) e.classid) - throw std::runtime_error ("CLASSIDTYPE has too few bits"); - classids[j].push_back ((CLASSIDTYPE) e.classid); - numclasses[j] = max (numclasses[j], 1u + e.classid); - } - } - if (vdim[m] == 0) - numframes = classids[j].size(); - if (numframes != classids[j].size()) // TODO: remove this once we are confident - throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); - assert (numframes == classids[j].size()); - - } - } - else - { - assert(classids.empty()); - } - - } - - } - - - assert (vdim[m] == 0 || numframes == pframes[m]->size()); - - foreach_index(j, labels) - assert (labels[j].empty() || numframes == classids[j].size()); - - if (vdim[m] != 0 && numframes != pframes[m]->size()) // || (!labels.empty() && numframes != classids.size())) - throw std::runtime_error ("\nminibatchframesource: numframes variable screwup"); - if (m==0) - { - foreach_index (j, numclasses) - fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %d classes\n", j, numclasses[j]); - } - fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %d frames read from %d utterances\n", m, pframes[m]->size(), infiles[m].size()); - if (notfound > 0) - { - fprintf (stderr, "minibatchframesourcemulti: %d files out of %d not found in label set\n", notfound, infiles[m].size()); - if (notfound > infiles[m].size() / 2) - throw std::runtime_error ("minibatchframesourcemulti: too many files not found in label set--assuming broken configuration\n"); - } - // notify frames source to switch from population to consumption mode - pframes[m]->no_more_push_back(); - - } - - if (numframes == 0 && !mayhavenoframe) - throw std::runtime_error ("minibatchframesource: no input features given!"); - - - // initialize randomizer - if (numframes > 0) - randomordering.resize (numframes, randomizationrange); - - } - virtual ~minibatchframesourcemulti() {} - size_t totalframes() const { - assert (maxvdim == 0 || numframes == pframes[0]->size()); assert (!issupervised() || numframes == classids[0].size()); return numframes; } - - bool issupervised() const { return !classids.empty(); } - - void setverbosity(int newverbosity) { verbosity = newverbosity; } - - // retrieve one minibatch - // Minibatches are deterministic pseudo-random samples. The entire corpus - // is repeated infinitely, but each repetition (a 'sweep') is randomized - // differently. 
- // This function allows to retrieve a mini-batch starting from any frame - // within this infinitely extended repetition. To the end, mini-batches are - // specified by start frame and #frames. - // This function returns the same data independent on #frames, i.e. the concept - // of the mini-batch is not defined in here, but on the caller side. The caller - // can retrieve the frames of a mini-batch in chunks that do not match the - // caller's definition of "mini-batch," e.g. bigger or smaller chunks. - // If a requested mini-batch spans a sweep boundary, then this function will - // not return samples after the sweep boundary. Instead, the returned frame - // set is shortened to not exceed the end of the sweep. The caller must make - // a separate second call to get the rest. In trainlayer(), the one - // sweep-boundary-spanning mini-batch will simply be shortened. - // This function is NOT thread-safe (due to caching of random sequence). - bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, - std::vector> & transcripts, - std::vector> & latticepairs) - { - - auto_timer timergetbatch; - bool readfromdisk; - size_t nreadfromdisk=0; - transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) - latticepairs.clear(); // neither are lattices - - assert (totalframes() > 0); - const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) - const size_t ts = globalts % totalframes(); // start frame within the sweep - const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary - assert (te > ts); - if (verbosity >= 2) - fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); - - // get random sequence (each time index occurs exactly once) - // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. - const auto & tmap = randomordering (sweep); - - feat.resize(pframes.size()); - uids.resize(classids.size()); - foreach_index(i, feat) - { - size_t leftextent, rightextent; - // page in the needed range of frames - if (leftcontext[i] == 0 && rightcontext[i] == 0) - { - leftextent = rightextent = augmentationextent(pframes[i]->dim(), vdim[i]); - } - else - { - leftextent = leftcontext[i]; - rightextent = rightcontext[i]; - } - readfromdisk = pframes[i]->require (randomordering.bounds (max (ts, leftextent) - leftextent, te + 1 + rightextent)); - // generate features and uids - feat[i].resize (vdim[i], te - ts); // note: special mode vdim == 0 means no features to be loaded - if (issupervised()) // empty means unsupervised training -> return empty uids - foreach_index(j, uids) - uids[j].resize (te - ts); - else - uids.clear(); - - for (size_t t = ts; t < te; t++) - { - size_t trand = tmap[t]; // the random-sequence sample point for this point in time - if (vdim[i] != 0) - { - auto v_t = feat[i].col(t-ts); // the vector to fill in - augmentneighbors (*pframes[i], boundaryflags, trand, leftextent, rightextent, v_t); - } - if (i==0){ // read labels for all outputs on first pass thru features. this guarantees they will be read if only one feature set but > 1 label set - if (issupervised()) - foreach_index(j, uids) - uids[j][t-ts] = classids[j][trand]; - } - } - timegetbatch = timergetbatch; - if (readfromdisk) - nreadfromdisk++; - - } - - (nreadfromdisk==feat.size()) ? 
readfromdisk = true : readfromdisk = false; - - return readfromdisk; - - } - - bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector & /*uids*/, - std::vector> & /*transcripts*/, - std::vector> & /*latticepairs*/) - { - // should never get here - throw runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchframesource instead\n"); - } - - double gettimegetbatch () { return timegetbatch;} - - // return first valid globalts to ask getbatch() for - // In frame mode, there is no constraint, i.e. it is 'globalts' itself. - /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; } - - /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); } - - }; -};}; \ No newline at end of file +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// rollingwindowsource.h -- implementation of a rolling-window minibatch source ('minibatchframesource') with a disk page file +// + +#pragma once + +#include "basetypes.h" // for attempt() +#include "numahelpers.h" // for NUMA allocation +#include "minibatchsourcehelpers.h" +#include "minibatchiterator.h" +#include "biggrowablevectors.h" +#include "ssematrix.h" + +namespace msra { namespace dbn { + + // --------------------------------------------------------------------------- + // biggrowablevectorarray -- a big array of vectors for features, growable (push_back) + // Data is striped across NUMA nodes, as to not clog them up. + // This also supports paging to disk, which is used for the old minibatchframesource. + // --------------------------------------------------------------------------- + class biggrowablevectorarray : public growablevectorbase + { + size_t m; // dim + + size_t inmembegin; // range we have in memory, rounded to enclosing blocks (not rounded at end) + size_t inmemend; + + wstring pagepath; // path for paging, empty if no paging + auto_file_ptr f; // file handle for paging + bool reading; // have we begun reading? + + // allocate a block + msra::dbn::matrix * newblock() const + { + // we stripe the data across NUMA nodes as to not fill up one node with the feature data + msra::numa::overridenode ((int) msra::numa::getmostspaciousnumanode()); + msra::dbn::matrix * res = new msra::dbn::matrix (m, elementsperblock); + msra::numa::overridenode (-1); // note: we really should reset it also in case of failure + return res; + } + + // handling of page file + bool paging() const { return !pagepath.empty(); } + void openpagefile (bool wantread) + { + if (!paging()) return; + msra::files::make_intermediate_dirs (pagepath); + + if (!wantread) + { + FILE *ftry = NULL; + wstring pathname (pagepath); + ftry = _wfopen (pathname.c_str(), L"wbS"); + if (ftry) fclose (ftry); + } + + /* + code below to cycle through a-z appended to file name is no longer necessary + since caller guarantees unique file names via HTKMLFReader + and we want the pagepath logged to the user to be the actual one used by the code + + // try to open the pagepath from a to z + if (!wantread) + { + FILE *ftry = NULL; + char trynum = 'a'; + while (!ftry && trynum <= 'z') + { + wstring pathname (pagepath); + pathname += trynum++; + ftry = _wfopen (pathname.c_str(), L"wbS"); + } + if (ftry) fclose (ftry); + pagepath += --trynum; + } + */ + f = fopenOrDie (pagepath, wantread ? 
L"rbS" : L"wbS"); + reading = wantread; + } + void flushlastblock() // during population phase, must be called once per block in sequence + { + if (!paging()) return; + assert (!reading); + if (blocks.empty()) return; + const size_t blockid = blocks.size() -1; + msra::dbn::matrix & block = *blocks[blockid]; + assert (fgetpos (f) == blockid * block.sizeinpagefile()); + block.topagefile (f); + blocks[blockid].reset(); // free the memory + assert (blockid * elementsperblock == inmembegin); + inmembegin = inmemend; // empty range + } + void releaseblock (size_t t0) // t0=block start time + { + assert (paging() && reading); + size_t blockid = t0 / elementsperblock; + assert (blockid * elementsperblock == t0); + assert (blocks[blockid]); + fprintf (stderr, "recoverblock: releasing feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); + blocks[blockid].reset(); // free the memory + } + void recoverblock (size_t t0) // t0=block start time + { + assert (paging() && reading); + size_t blockid = t0 / elementsperblock; + assert (blockid * elementsperblock == t0); + assert (!blocks[blockid]); + fprintf (stderr, "recoverblock: recovering feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1); + blocks[blockid].reset (newblock()); + msra::dbn::matrix & block = *blocks[blockid]; + fsetpos (f, blockid * block.sizeinpagefile()); + block.frompagefile (f); + } + + public: + biggrowablevectorarray (const wstring & pagepath) + : growablevectorbase (65536), m (0), + inmembegin (0), inmemend (0), pagepath (pagepath), reading (false) + { + openpagefile (false); + if (paging()) + fprintf (stderr, "biggrowablevectorarray: creating disk backup store at '%S'\n", pagepath.c_str()); + } + ~biggrowablevectorarray() { // clean up the big temp file + if (paging()) { + fclose (f); + if (_wunlink (pagepath.c_str())==0) + fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); + else + fprintf (stderr, "biggrowablevectorarray: unable to delete disk backup store at '%S'\n", pagepath.c_str()); + } + } + + size_t dim() const { return m; } // dimension of a frame + + // reading phase + void push_back (const std::vector & in) + { + assert (!in.empty()); + assert (m == 0 || m == in.size()); + m = in.size(); + const size_t blockid = n / elementsperblock; + assert (blockid <= blocks.size()); + if (blockid == blocks.size()) // a new block is needed + { + flushlastblock(); + blocks.push_back (std::unique_ptr (newblock())); + } + const size_t blockn = n % elementsperblock; + msra::dbn::matrix & block = *blocks[blockid].get(); + foreach_index (k, in) + block(k,blockn) = in[k]; + n++; + inmemend = n; + } + void no_more_push_back() // done pushing --switch to consumption mode + { + if (!paging()) return; + // finish off last block + flushlastblock(); + fflushOrDie (f); + fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f)); + fclose (f); + foreach_index (i, blocks) assert (!blocks[i]); // ensure we flushed + assert (inmembegin == inmemend); // nothing in cache + // switch to reading mode + openpagefile (true); + } + + // access phase + // Returns 'true' if data was actually read from disk. 
+ bool require (pair bounds) // we require this range of frames + { + bool readfromdisk = false; + + // get bounds rounded to block boundaries + const size_t ts = bounds.first / elementsperblock * elementsperblock; + const size_t te = min (n, (bounds.second + elementsperblock -1) / elementsperblock * elementsperblock); + assert (paging()); + // free all the memmory + for (size_t t = inmembegin; t < inmemend; t += elementsperblock) + { + if (t >= ts && t < te) // if in wanted range then skip to end of it + t = te - elementsperblock; + else + releaseblock (t); + } + // page in all required blocks + for (size_t t = ts; t < te; t += elementsperblock) + { + if (t >= inmembegin && t < inmemend) // if in memory already then skip to end of it + t = inmemend - elementsperblock; + else + { + recoverblock (t); + readfromdisk = true; // tell caller we did something expensive + } + } + // got it + inmembegin = ts; + inmemend = te; + return readfromdisk; + } + const msra::dbn::matrixstripe operator[] (size_t t) const // get a feature vector + { + if (t < inmembegin || t >= inmemend) + throw std::logic_error ("biggrowablevectorarray: attempt to access vector without requesting to page it in first"); + const size_t blockt = getblockt (t); + /*const*/ msra::dbn::matrix & block = getblock (t); + return msra::dbn::matrixstripe (block, blockt, 1); + } + wstring pagepathname(){ return pagepath;} + void cleanuppagefile() + { + if (paging()) { + fclose (f); + if (_wunlink (pagepath.c_str())==0){ + fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str()); + } + else{ + fprintf (stderr, "biggrowablevectorarray: could NOT delete disk backup store at '%S'\n", pagepath.c_str()); + } + } + } + }; + + // --------------------------------------------------------------------------- + // minibatchframesource -- feature source to provide randomized frames in minibatches + // This is the old code that pages all frames to a huge disk file first. + // (The new minibatchutterancesource pages from input files directly and can also + // operate in utterance mode for MMI training.) + // --------------------------------------------------------------------------- + class minibatchframesource : public minibatchsource + { + size_t vdim; // feature dimension after augmenting neighhors (0: don't read features) + unsigned int sampperiod; // (for reference and to check against model) + string featkind; + size_t featdim; + // cache + biggrowablevectorarray frames; // [t][i] all features concatenated + std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors()) + std::vector classids; // [t] the state that the frame belongs to + size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels + msra::dbn::randomordering randomordering; // [t] -> t' + double timegetbatch; + int verbosity; + public: + // constructor + // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). 
+ minibatchframesource (const std::vector & infiles, const map> & labels, + size_t vdim, size_t udim, size_t randomizationrange, const wstring & pagepath, const bool mayhavenoframe=false, int addEnergy=0) + : vdim (vdim), sampperiod (0), featdim (0), numframes (0), frames (pagepath), timegetbatch (0), verbosity(2) + { + if (vdim == 0 && labels.empty()) + throw runtime_error ("minibatchframesource: when running without features, labels are needed"); + // at this stage, we simply page in the entire training set at once and work off RAM + // We will benefit from feature archives indirectly through htkfeatio. + // TODO: + // - infiles must specify time range + // - at this stage only reserve() (we know the time range; allocate second-layer structure) + // - implement block-wise paging directly from HTK feature files through htkfeatreader + featkind.clear(); + std::vector frame; + fprintf (stderr, "minibatchframesource: reading %d utterances..", infiles.size()); + size_t numclasses = 0; // number of units found (actually max id +1) + size_t notfound = 0; // number of entries missing in MLF + msra::asr::htkfeatreader reader; // feature reader + reader.AddEnergy(addEnergy); + + foreach_index (i, infiles) + { + if (i % (infiles.size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } + msra::basetypes::matrix feat; + msra::asr::htkfeatreader::parsedpath ppath (infiles[i]); + + // skip files for which labels don't exist (assuming bad alignment) + wstring key; + if (!labels.empty()) // empty means unsupervised mode (don't load any) + { + key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) + if (labels.find (key) == labels.end()) + { + if (notfound < 5) + fprintf (stderr, "\nminibatchframesource: %d-th file not found in MLF label set: %S", i, key.c_str()); + notfound++; + continue; // skip this utterance at all + } + } + + // get feature frames + if (vdim != 0) // (vdim == special mode to not read features at all) + { + msra::util::attempt (5, [&]() + { + reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors + }); + if (featdim == 0) // first time + featdim = feat.rows(); + else if (featdim != feat.rows()) + throw std::runtime_error ("minibatchframesource: inconsistent feature dimension across files"); + // HVite occasionally generates mismatching output --skip such files + if (!key.empty()) // (we have a key if supervised mode) + { + const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) + size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); + if (abs ((int) labframes - (int) feat.cols()) > 0) + { + fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str()); + notfound++; + continue; // skip this utterance at all + } + } + // append to cache + frame.resize (featdim); + if (feat.cols() < 2) // (2 frames needed for boundary markers) + throw std::runtime_error ("minibatchframesource: utterances < 2 frames not supported"); + foreach_column (t, feat) + { + foreach_index (k, frame) + frame[k] = feat(k,t); + frames.push_back (frame); + numframes++; + boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? 
+1 : 0); + } + assert (numframes == frames.size()); + assert (numframes == boundaryflags.size()); + } + + // get label sequence + if (!key.empty()) // (we have a key if supervised mode) + { + const auto & labseq = labels.find (key)->second; // (we already checked above that it exists) + foreach_index (i, labseq) + { + const auto & e = labseq[i]; + if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: labels not in consecutive order MLF in label set: %S", key.c_str())); + for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) + { + if (e.classid >= udim) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: class id exceeds model dimension in file %S", key.c_str())); + if (e.classid != (CLASSIDTYPE) e.classid) + throw std::runtime_error ("CLASSIDTYPE has too few bits"); + classids.push_back ((CLASSIDTYPE) e.classid); + numclasses = max (numclasses, 1u + e.classid); + } + } + if (vdim == 0) + numframes = classids.size(); + if (numframes != classids.size()) // TODO: remove this once we are confident + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); + assert (numframes == classids.size()); + } + else + { + assert (classids.empty()); // that's how we detect it later + } + } + assert (vdim == 0 || numframes == frames.size()); + assert (labels.empty() || numframes == classids.size()); + if ((vdim != 0 && numframes != frames.size()) || (!labels.empty() && numframes != classids.size())) + throw std::runtime_error ("minibatchframesource: numframes variable screwup"); + fprintf (stderr, " %d frames read from %d utterances; %d classes\n", numframes, infiles.size(), numclasses); + if (notfound > 0) + { + fprintf (stderr, "minibatchframesource: %d files out of %d not found in label set\n", notfound, infiles.size()); + if (notfound > infiles.size() / 2) + throw std::runtime_error ("minibatchframesource: too many files not found in label set--assuming broken configuration\n"); + } + + if (numframes == 0 && !mayhavenoframe) + throw std::runtime_error ("minibatchframesource: no input features given!"); + + // notify frames source to switch from population to consumption mode + frames.no_more_push_back(); + + // initialize randomizer + if (numframes > 0) + randomordering.resize (numframes, randomizationrange); + } + virtual ~minibatchframesource() {} + size_t totalframes() const { assert (vdim == 0 || numframes == frames.size()); assert (!issupervised() || numframes == classids.size()); return numframes; } + + bool issupervised() const { return !classids.empty(); } + + void setverbosity(int newverbosity) { verbosity = newverbosity; } + + // retrieve one minibatch + // Minibatches are deterministic pseudo-random samples. The entire corpus + // is repeated infinitely, but each repetition (a 'sweep') is randomized + // differently. + // This function allows to retrieve a mini-batch starting from any frame + // within this infinitely extended repetition. To the end, mini-batches are + // specified by start frame and #frames. + // This function returns the same data independent on #frames, i.e. the concept + // of the mini-batch is not defined in here, but on the caller side. The caller + // can retrieve the frames of a mini-batch in chunks that do not match the + // caller's definition of "mini-batch," e.g. bigger or smaller chunks. 
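+    // For example (an illustrative call sequence only; 'source' and the chunk
+    // sizes are made up), a logical minibatch of 512 frames may equally well be
+    // fetched in two halves:
+    //     source.getbatch (globalts,       256, feat, uids, transcripts, latticepairs);
+    //     source.getbatch (globalts + 256, 256, feat, uids, transcripts, latticepairs);
+    // Both calls draw from the same deterministic random order of the sweep.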
+ // If a requested mini-batch spans a sweep boundary, then this function will + // not return samples after the sweep boundary. Instead, the returned frame + // set is shortened to not exceed the end of the sweep. The caller must make + // a separate second call to get the rest. In trainlayer(), the one + // sweep-boundary-spanning mini-batch will simply be shortened. + // This function is NOT thread-safe (due to caching of random sequence). + bool getbatch (const size_t globalts, const size_t framesrequested, msra::dbn::matrix & feat, std::vector & uids, + std::vector> & transcripts, + std::vector> & latticepairs) + { + auto_timer timergetbatch; + + transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) + latticepairs.clear(); // neither are lattices + + assert (totalframes() > 0); + const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) + const size_t ts = globalts % totalframes(); // start frame within the sweep + const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary + assert (te > ts); + if (verbosity >= 2) + fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); + + // get random sequence (each time index occurs exactly once) + // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. + const auto & tmap = randomordering (sweep); + + // page in the needed range of frames + const size_t extent = augmentationextent (frames.dim(), vdim); + bool readfromdisk = frames.require (randomordering.bounds (max (ts, extent) - extent, te + 1 + extent)); + + // generate features and uids + feat.resize (vdim, te - ts); // note: special mode vdim == 0 means no features to be loaded + if (issupervised()) // empty means unsupervised training -> return empty uids + uids.resize (te - ts); + else + uids.clear(); + for (size_t t = ts; t < te; t++) + { + size_t trand = tmap[t]; // the random-sequence sample point for this point in time + if (vdim != 0) + { + auto v_t = feat.col(t-ts); // the vector to fill in + augmentneighbors (frames, boundaryflags, trand, v_t); + } + if (issupervised()) + uids[t-ts] = classids[trand]; + } + timegetbatch = timergetbatch; + return readfromdisk; + } + + bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, + std::vector> & transcripts, + std::vector> & latticepairs) + { + // for single input/output set size to be 1 and run old getbatch + feat.resize(1); + uids.resize(1); + //transcripts.resize(1); + //latticepairs.resize(1); + return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs); + } + + double gettimegetbatch () { return timegetbatch;} + + // return first valid globalts to ask getbatch() for + // In frame mode, there is no constraint, i.e. it is 'globalts' itself. 
+    /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; }
+
+    /*implement*/ const std::vector<size_t> & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); static std::vector<size_t> x; return x; /*keep compiler happy*/ }
+};
+
+// ---------------------------------------------------------------------------
+// minibatchframesourcemulti -- feature source to provide randomized frames in minibatches
+// this is derived from minibatchframesource but works with multiple inputs and/or outputs
+// by making "frames" and "classids" a vector of vectors
+// ---------------------------------------------------------------------------
+class minibatchframesourcemulti : public minibatchsource
+{
+    std::vector<size_t> vdim;           // feature dimension after augmenting neighbors (0: don't read features)
+    std::vector<size_t> leftcontext;    // number of frames to the left of the target frame in the context window
+    std::vector<size_t> rightcontext;   // number of frames to the right of the target frame in the context window
+    unsigned int sampperiod;            // (for reference and to check against model)
+    string featkind;
+    size_t featdim;
+    size_t maxvdim;
+    // cache
+    //std::vector frames;
+    std::vector<unique_ptr<biggrowablevectorarray>> pframes;    // [i] one frame cache per feature set; [t] all features concatenated
+    std::vector<char> boundaryflags;                // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors())
+    std::vector<std::vector<CLASSIDTYPE>> classids; // [j][t] the state that the frame belongs to, per label set
+    size_t numframes;           // total frames (==pframes[*]->size()==boundaryflags.size()==classids[*].size()) unless special modes vdim == 0 and/or no labels
+    msra::dbn::randomordering randomordering;   // [t] -> t'
+    double timegetbatch;
+    int verbosity;
+
+public:
+    // constructor
+    // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
+    minibatchframesourcemulti (const std::vector<std::vector<wstring>> & infiles, const std::vector<map<wstring, std::vector<msra::asr::htkmlfentry>>> & labels,
+                               std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext,
+                               size_t randomizationrange, const std::vector<wstring> & pagepath, const bool mayhavenoframe = false, int addEnergy = 0)
+        : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), numframes (0), timegetbatch (0), verbosity(2), maxvdim(0)
+    {
+        if (vdim[0] == 0 && labels.empty())
+            throw runtime_error ("minibatchframesourcemulti: when running without features, labels are needed");
+        // at this stage, we simply page in the entire training set at once and work off RAM
+        // We will benefit from feature archives indirectly through htkfeatio.
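+        // Note: foreach_index and foreach_column are iteration macros from the
+        // basetypes headers; foreach_index (i, v) expands to roughly
+        //     for (int i = 0; i < (int) v.size(); i++)
+        // (reconstructed from its usage below; see basetypes.h for the exact form).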
+        // TODO:
+        //  - infiles must specify time range
+        //  - at this stage only reserve() (we know the time range; allocate second-layer structure)
+        //  - implement block-wise paging directly from HTK feature files through htkfeatreader
+        featkind.clear();
+        std::vector<float> frame;
+        std::vector<size_t> numclasses;     // number of units found per label set (actually max id + 1)
+        size_t notfound = 0;                // number of entries missing in MLF
+
+        std::vector<size_t> framesaccum;
+
+        if (infiles.size() == 0)
+            throw runtime_error("minibatchframesourcemulti: need at least one network input specified with features");
+
+        if (labels.size() == 0)
+            fprintf(stderr, "no MLF label files detected\n");
+
+        foreach_index (i, infiles)
+        {
+            pframes.push_back(unique_ptr<biggrowablevectorarray>(new biggrowablevectorarray(pagepath[i])));
+
+            if (vdim[i] > maxvdim)
+                maxvdim = vdim[i];
+        }
+
+        foreach_index (i, labels)
+        {
+            classids.push_back(std::vector<CLASSIDTYPE>());
+            numclasses.push_back(0);
+        }
+
+        fprintf (stderr, "minibatchframesourcemulti: reading %d feature sets and %d label sets...", infiles.size(), labels.size());
+
+        foreach_index (m, infiles)
+        {
+            featdim = 0;
+            numframes = 0;
+            featkind.clear();
+            msra::asr::htkfeatreader reader;    // feature reader
+            reader.AddEnergy(addEnergy);
+
+            foreach_index (i, infiles[m])   // read each feature file in set m
+            {
+                if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
+                msra::basetypes::matrix<float> feat;
+                msra::asr::htkfeatreader::parsedpath ppath (infiles[m][i]);
+
+                // skip files for which labels don't exist (assuming bad alignment)
+                wstring key;
+                if (!labels.empty())
+                {
+                    if (!labels[0].empty())     // empty means unsupervised mode (don't load any)
+                    {
+                        key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none)
+                        if (labels[0].find (key) == labels[0].end())
+                        {
+                            if (notfound < 5)
+                                fprintf (stderr, "\nminibatchframesourcemulti: %d-th file not found in MLF label set: %S", i, key.c_str());
+                            notfound++;
+                            continue;   // skip this utterance altogether
+                        }
+                    }
+                }
+                // get feature frames
+                if (vdim[m] != 0)   // (vdim == 0 is a special mode: don't read features at all)
+                {
+                    msra::util::attempt (5, [&]()
+                    {
+                        reader.read (ppath, featkind, sampperiod, feat);    // whole file read as columns of feature vectors
+                    });
+                    if (featdim == 0)   // first time
+                        featdim = feat.rows();
+                    else if (featdim != feat.rows())
+                        throw std::runtime_error ("minibatchframesourcemulti: inconsistent feature dimension across files");
+                    // HVite occasionally generates mismatching output -- skip such files
+                    if (!key.empty())   // (we have a key if supervised mode)
+                    {
+                        const auto & labseq = labels[0].find (key)->second; // (we already checked above that it exists)
+                        size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
+                        if (abs ((int) labframes - (int) feat.cols()) > 0)
+                        {
+                            fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
+                            notfound++;
+                            continue;   // skip this utterance altogether
+                        }
+                    }
+                    // append to cache
+                    frame.resize (featdim);
+                    if (feat.cols() < 2)    // (2 frames needed for boundary markers)
+                        throw std::runtime_error ("minibatchframesourcemulti: utterances < 2 frames not supported");
+                    foreach_column (t, feat)
+                    {
+                        foreach_index (k, frame)
+                            frame[k] = feat(k,t);
+
+                        pframes[m]->push_back (frame);
+                        numframes++;
+                        if (m == 0)
+                            boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() - 1) ?
+1 : 0); + } + if (m==0) + framesaccum.push_back(numframes); + else + assert(numframes == framesaccum[i]); + + assert (numframes == pframes[m]->size()); + } + if (m==0) + assert (numframes == boundaryflags.size()); + + + + if (m==0) // after we get the key for this file, read all labels (only done for first feature) + { + if (!key.empty()) + { + foreach_index (j, labels) + { + const auto & labseq = labels[j].find (key)->second; // (we already checked above that it exists) + foreach_index (i, labseq) + { + const auto & e = labseq[i]; + if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: labels not in consecutive order MLF in label set: %S", key.c_str())); + for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) + { + if (e.classid >= udim[j]) + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: class id exceeds model dimension in file %S", key.c_str())); + if (e.classid != (CLASSIDTYPE) e.classid) + throw std::runtime_error ("CLASSIDTYPE has too few bits"); + classids[j].push_back ((CLASSIDTYPE) e.classid); + numclasses[j] = max (numclasses[j], 1u + e.classid); + } + } + if (vdim[m] == 0) + numframes = classids[j].size(); + if (numframes != classids[j].size()) // TODO: remove this once we are confident + throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); + assert (numframes == classids[j].size()); + + } + } + else + { + assert(classids.empty()); + } + + } + + } + + + assert (vdim[m] == 0 || numframes == pframes[m]->size()); + + foreach_index(j, labels) + assert (labels[j].empty() || numframes == classids[j].size()); + + if (vdim[m] != 0 && numframes != pframes[m]->size()) // || (!labels.empty() && numframes != classids.size())) + throw std::runtime_error ("\nminibatchframesource: numframes variable screwup"); + if (m==0) + { + foreach_index (j, numclasses) + fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %d classes\n", j, numclasses[j]); + } + fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %d frames read from %d utterances\n", m, pframes[m]->size(), infiles[m].size()); + if (notfound > 0) + { + fprintf (stderr, "minibatchframesourcemulti: %d files out of %d not found in label set\n", notfound, infiles[m].size()); + if (notfound > infiles[m].size() / 2) + throw std::runtime_error ("minibatchframesourcemulti: too many files not found in label set--assuming broken configuration\n"); + } + // notify frames source to switch from population to consumption mode + pframes[m]->no_more_push_back(); + + } + + if (numframes == 0 && !mayhavenoframe) + throw std::runtime_error ("minibatchframesource: no input features given!"); + + + // initialize randomizer + if (numframes > 0) + randomordering.resize (numframes, randomizationrange); + + } + virtual ~minibatchframesourcemulti() {} + size_t totalframes() const { + assert (maxvdim == 0 || numframes == pframes[0]->size()); assert (!issupervised() || numframes == classids[0].size()); return numframes; } + + bool issupervised() const { return !classids.empty(); } + + void setverbosity(int newverbosity) { verbosity = newverbosity; } + + // retrieve one minibatch + // Minibatches are deterministic pseudo-random samples. The entire corpus + // is repeated infinitely, but each repetition (a 'sweep') is randomized + // differently. 
+ // This function allows to retrieve a mini-batch starting from any frame + // within this infinitely extended repetition. To the end, mini-batches are + // specified by start frame and #frames. + // This function returns the same data independent on #frames, i.e. the concept + // of the mini-batch is not defined in here, but on the caller side. The caller + // can retrieve the frames of a mini-batch in chunks that do not match the + // caller's definition of "mini-batch," e.g. bigger or smaller chunks. + // If a requested mini-batch spans a sweep boundary, then this function will + // not return samples after the sweep boundary. Instead, the returned frame + // set is shortened to not exceed the end of the sweep. The caller must make + // a separate second call to get the rest. In trainlayer(), the one + // sweep-boundary-spanning mini-batch will simply be shortened. + // This function is NOT thread-safe (due to caching of random sequence). + bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, + std::vector> & transcripts, + std::vector> & latticepairs) + { + + auto_timer timergetbatch; + bool readfromdisk; + size_t nreadfromdisk=0; + transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI) + latticepairs.clear(); // neither are lattices + + assert (totalframes() > 0); + const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization) + const size_t ts = globalts % totalframes(); // start frame within the sweep + const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary + assert (te > ts); + if (verbosity >= 2) + fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep); + + // get random sequence (each time index occurs exactly once) + // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes. + const auto & tmap = randomordering (sweep); + + feat.resize(pframes.size()); + uids.resize(classids.size()); + foreach_index(i, feat) + { + size_t leftextent, rightextent; + // page in the needed range of frames + if (leftcontext[i] == 0 && rightcontext[i] == 0) + { + leftextent = rightextent = augmentationextent(pframes[i]->dim(), vdim[i]); + } + else + { + leftextent = leftcontext[i]; + rightextent = rightcontext[i]; + } + readfromdisk = pframes[i]->require (randomordering.bounds (max (ts, leftextent) - leftextent, te + 1 + rightextent)); + // generate features and uids + feat[i].resize (vdim[i], te - ts); // note: special mode vdim == 0 means no features to be loaded + if (issupervised()) // empty means unsupervised training -> return empty uids + foreach_index(j, uids) + uids[j].resize (te - ts); + else + uids.clear(); + + for (size_t t = ts; t < te; t++) + { + size_t trand = tmap[t]; // the random-sequence sample point for this point in time + if (vdim[i] != 0) + { + auto v_t = feat[i].col(t-ts); // the vector to fill in + augmentneighbors (*pframes[i], boundaryflags, trand, leftextent, rightextent, v_t); + } + if (i==0){ // read labels for all outputs on first pass thru features. this guarantees they will be read if only one feature set but > 1 label set + if (issupervised()) + foreach_index(j, uids) + uids[j][t-ts] = classids[j][trand]; + } + } + timegetbatch = timergetbatch; + if (readfromdisk) + nreadfromdisk++; + + } + + (nreadfromdisk==feat.size()) ? 
readfromdisk = true : readfromdisk = false; + + return readfromdisk; + + } + + bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector & /*uids*/, + std::vector> & /*transcripts*/, + std::vector> & /*latticepairs*/) + { + // should never get here + throw runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchframesource instead\n"); + } + + double gettimegetbatch () { return timegetbatch;} + + // return first valid globalts to ask getbatch() for + // In frame mode, there is no constraint, i.e. it is 'globalts' itself. + /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; } + + /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); } + + }; +};}; diff --git a/DataReader/HTKMLFReader/utterancesource.h b/DataReader/HTKMLFReader/utterancesource.h index 672cb72c4..982269667 100644 --- a/DataReader/HTKMLFReader/utterancesource.h +++ b/DataReader/HTKMLFReader/utterancesource.h @@ -768,6 +768,7 @@ private: if (chunkdata.isinram()) return false; + if (verbosity) fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1); msra::util::attempt (5, [&]() // (reading from network) { @@ -858,6 +859,7 @@ public: transcripts.clear(); // return these utterances + if (verbosity > 0) fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep); size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch for (size_t pos = spos; pos < epos; pos++) @@ -922,6 +924,7 @@ public: const size_t lastchunk = chunkforframepos (globalte-1); const size_t windowbegin = randomizedchunks[firstchunk].windowbegin; const size_t windowend = randomizedchunks[lastchunk].windowend; + if (verbosity > 0) fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n", globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend); // release all data outside, and page in all data inside diff --git a/DataReader/HTKMLFReader/utterancesourcemulti.h b/DataReader/HTKMLFReader/utterancesourcemulti.h index 1e97242a7..510e7bc32 100644 --- a/DataReader/HTKMLFReader/utterancesourcemulti.h +++ b/DataReader/HTKMLFReader/utterancesourcemulti.h @@ -102,7 +102,7 @@ class minibatchutterancesourcemulti : public minibatchsource bool isinram() const { return !frames.empty(); } // page in data for this chunk // We pass in the feature info variables by ref which will be filled lazily upon first read - void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const + void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const { if (numutterances() == 0) throw std::logic_error ("requiredata: cannot page in virgin block"); @@ -132,6 +132,7 @@ class minibatchutterancesourcemulti : public minibatchsource latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols()); } //fprintf (stderr, "\n"); + if (verbosity) fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size()); } catch (...) 
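The hunks above thread verbosity through to requiredata(), whose network reads are wrapped in msra::util::attempt (5, ...). The helper's real implementation is not part of this diff; the following is a minimal sketch of such a retry wrapper under the obvious interpretation (run the lambda up to 'retries' times, rethrow the last failure):

    #include <cstdio>
    #include <exception>

    // retry 'body' up to 'retries' times; log and retry on failure, rethrow the
    // last exception when all attempts are exhausted (sketch, not CNTK's code)
    template <typename FUNCTION>
    static void attempt (int retries, const FUNCTION & body)
    {
        for (int i = 1; ; i++)
        {
            try
            {
                body();
                return;                 // success
            }
            catch (const std::exception & e)
            {
                if (i >= retries)
                    throw;              // out of retries: propagate the error
                fprintf (stderr, "attempt: %s, retrying %d-th time out of %d...\n",
                         e.what(), i + 1, retries);
            }
        }
    }

A call site then reads exactly like the hunk above: attempt (5, [&]() { chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity); });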
@@ -568,6 +569,7 @@ private: return sweep; currentsweep = sweep; + if (verbosity>0) fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance"); const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep @@ -919,10 +921,11 @@ private: { auto & chunk = randomizedchunks[m][chunkindex]; auto & chunkdata = chunk.getchunkdata(); + if (verbosity) fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1); msra::util::attempt (5, [&]() // (reading from network) { - chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices); + chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity); }); } chunksinram++; @@ -1029,7 +1032,8 @@ public: } } // return these utterances - fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep); + if (verbosity > 0) + fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep); size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch for (size_t pos = spos; pos < epos; pos++) { @@ -1107,6 +1111,7 @@ public: const size_t lastchunk = chunkforframepos (globalte-1); const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin; const size_t windowend = randomizedchunks[0][lastchunk].windowend; + if (verbosity) fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n", globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend); // release all data outside, and page in all data inside @@ -1230,3 +1235,4 @@ public: }; };}; + diff --git a/DataReader/SequenceReader/SequenceParser.h b/DataReader/SequenceReader/SequenceParser.h index 1226aaaf5..9115b2a00 100644 --- a/DataReader/SequenceReader/SequenceParser.h +++ b/DataReader/SequenceReader/SequenceParser.h @@ -1,616 +1,616 @@ -// SequenceParser.h : Parses the UCI format using a custom state machine (for speed) -// -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// - -#include -#include -#include -#include -#include -#include - -using namespace std; - -#define MAXSTRING 2048 -// UCI label location types -enum LabelMode -{ - LabelNone = 0, - LabelFirst = 1, - LabelLast = 2, -}; - -enum ParseMode -{ - ParseNormal = 0, - ParseLineCount = 1 -}; - -enum SequenceFlags -{ - seqFlagNull = 0, - seqFlagLineBreak = 1, // line break on the parsed line - seqFlagEmptyLine = 2, // empty line - seqFlagStartLabel = 4, - seqFlagStopLabel = 8 -}; - -// SequencePosition, save the ending indexes into the array for a sequence -struct SequencePosition -{ - size_t numberPos; // max position in the number array for this sequence - size_t labelPos; // max position in the label array for this sequence - unsigned flags; // flags that apply to this sequence - SequencePosition(size_t numPos, size_t labelPos, unsigned flags): - numberPos(numPos), labelPos(labelPos), flags(flags) - {} -}; - -// SequenceParser - the parser for the UCI format files -// for ultimate speed, this class implements a state machine to read these format files -template -class SequenceParser -{ -protected: - enum ParseState - { - WholeNumber = 0, - Remainder = 1, - Exponent = 2, - Whitespace = 3, - Sign = 4, - ExponentSign = 5, - Period = 6, - TheLetterE = 7, - EndOfLine = 8, - Label = 9, // any non-number things we run into - ParseStateMax = 10, // number of parse states - LineCountEOL = 10, - LineCountOther = 11, - AllStateMax = 12 - }; - - // type of label processing - ParseMode m_parseMode; - - // definition of label and feature dimensions - size_t m_dimFeatures; - - size_t m_dimLabelsIn; - std::string m_beginSequenceIn; // starting sequence string (i.e. ) - std::string m_endSequenceIn; // ending sequence string (i.e. ) - - size_t m_dimLabelsOut; - std::string m_beginSequenceOut; // starting sequence string (i.e. 'O') - std::string m_endSequenceOut; // ending sequence string (i.e. 'O') - - // level of screen output - int m_traceLevel; - - // current state of the state machine - ParseState m_current_state; - - // state tables - DWORD *m_stateTable; - - // numeric state machine variables - double m_partialResult; - double m_builtUpNumber; - double m_divider; - double m_wholeNumberMultiplier; - double m_exponentMultiplier; - - // label state machine variables - size_t m_spaceDelimitedStart; - size_t m_spaceDelimitedMax; // start of the next whitespace sequence (one past the end of the last word) - int m_numbersConvertedThisLine; - int m_labelsConvertedThisLine; - int m_elementsConvertedThisLine; - - // sequence state machine variables - bool m_beginSequence; - bool m_endSequence; - std::string m_beginTag; - std::string m_endTag; - - // global stats - int m_totalNumbersConverted; - int m_totalLabelsConverted; - - // file positions/buffer - FILE * m_pFile; - int64_t m_byteCounter; - int64_t m_fileSize; - - BYTE * m_fileBuffer; - size_t m_bufferStart; - size_t m_bufferSize; - - // last label was a string (for last label processing) - bool m_lastLabelIsString; - - // vectors to append to - std::vector* m_numbers; // pointer to vectors to append with numbers - std::vector* m_labels; // pointer to vector to append with labels (may be numeric) - // FUTURE: do we want a vector to collect string labels in the non string label case? 
(signifies an error) - - // SetState for a particular value - void SetState(int value, ParseState m_current_state, ParseState next_state); - - // SetStateRange - set states transitions for a range of values - void SetStateRange(int value1, int value2, ParseState m_current_state, ParseState next_state); - - // SetupStateTables - setup state transition tables for each state - // each state has a block of 256 states indexed by the incoming character - void SetupStateTables(); - - // reset all line state variables - void PrepareStartLine(); - - // reset all number accumulation variables - void PrepareStartNumber(); - - // reset all state variables to start reading at a new position - void PrepareStartPosition(size_t position); - - // UpdateBuffer - load the next buffer full of data - // returns - number of records read - size_t UpdateBuffer(); - -public: - - // SequenceParser constructor - SequenceParser(); - // setup all the state variables and state tables for state machine - void Init(); - - // Parser destructor - ~SequenceParser(); - -private: - // DoneWithLabel - Called when a string label is found - void DoneWithLabel(); - - // Called when a number is complete - void DoneWithValue(); - - // store label is specialized by LabelType - void StoreLabel(NumType value); - - // StoreLastLabel - store the last label (for numeric types), tranfers to label vector - // string label types handled in specialization - void StoreLastLabel(); - -public: - // SetParseMode - Set the parsing mode - // mode - set mode to either ParseLineCount, or ParseNormal - void SetParseMode(ParseMode mode); - - // SetTraceLevel - Set the level of screen output - // traceLevel - traceLevel, zero means no output, 1 epoch related output, > 1 all output - void SetTraceLevel(int traceLevel); - - - // ParseInit - Initialize a parse of a file - // fileName - path to the file to open - // dimFeatures - number of features for precomputed features - // dimLabelsIn - number of lables possible on input - // dimLabelsOut - number of labels possible on output - // beginSequenceIn - beginSequence input label - // endSequenceIn - endSequence input label - // beginSequenceOut - beginSequence output label - // endSequenceOut - endSequence output label - // bufferSize - size of temporary buffer to store reads - // startPosition - file position on which we should start - void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O", size_t bufferSize=1024*256, size_t startPosition=0) - { - assert(fileName != NULL); - m_dimFeatures = dimFeatures; - m_dimLabelsIn = dimLabelsIn; - m_beginSequenceIn = beginSequenceIn; - m_endSequenceIn = endSequenceIn; - m_dimLabelsOut = dimLabelsOut; - m_beginSequenceOut = beginSequenceOut; - m_endSequenceOut = endSequenceOut; - - m_parseMode = ParseNormal; - m_traceLevel = 0; - m_bufferSize = bufferSize; - m_bufferStart = startPosition; - - m_beginTag = m_beginSequenceIn; - m_endTag = m_endSequenceIn; - - // if we have a file already open, cleanup - if (m_pFile != NULL) - SequenceParser::~SequenceParser(); - - errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" ); - if (err) - RuntimeError("SequenceParser::ParseInit - error opening file"); - int rc = _fseeki64(m_pFile, 0, SEEK_END); - if (rc) - RuntimeError("SequenceParser::ParseInit - error seeking in file"); - - m_fileSize = GetFilePosition(); - m_fileBuffer = new BYTE[m_bufferSize]; - 
SetFilePosition(startPosition); - } - - // Parse - Parse the data - // recordsRequested - number of records requested - // labels - pointer to vector to return the labels - // numbers - pointer to vector to return the numbers - // seqPos - pointers to the other two arrays showing positions of each sequence - // returns - number of records actually read, if the end of file is reached the return value will be < requested records - long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) - { - assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); - assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); - - // transfer to member variables - m_numbers = numbers; - m_labels = labels; - - long TickStart = GetTickCount( ); - long recordCount = 0; - long lineCount = 0; - size_t bufferIndex = m_byteCounter-m_bufferStart; - SequencePosition sequencePositionLast(0,0,seqFlagNull); - while (m_byteCounter < m_fileSize && recordCount < recordsRequested) - { - // check to see if we need to update the buffer - if (bufferIndex >= m_bufferSize) - { - UpdateBuffer(); - bufferIndex = m_byteCounter-m_bufferStart; - } - - char ch = m_fileBuffer[bufferIndex]; - - ParseState nextState = (ParseState)m_stateTable[(m_current_state<<8)+ch]; - - if( nextState <= Exponent ) - { - m_builtUpNumber = m_builtUpNumber * 10 + (ch - '0'); - // if we are in the decimal portion of a number increase the divider - if (nextState == Remainder) - m_divider *= 10; - } - - // only do a test on a state transition - if (m_current_state != nextState) - { - // System.Diagnostics.Debug.WriteLine("Current state = " + m_current_state + ", next state = " + nextState); - - // if the nextState is a label, we don't want to do any number processing, it's a number prefixed string - if (nextState != Label) - { - // do the numeric processing - switch (m_current_state) - { - case TheLetterE: - if (m_divider != 0) // decimal number - m_partialResult += m_builtUpNumber / m_divider; - else // integer - m_partialResult = m_builtUpNumber; - m_builtUpNumber = 0; - break; - case WholeNumber: - // could be followed by a remainder, or an exponent - if (nextState != TheLetterE) - if( nextState != Period) - DoneWithValue(); - if (nextState == Period) - { - m_partialResult = m_builtUpNumber; - m_divider = 1; - m_builtUpNumber = 0; - } - break; - case Remainder: - // can only be followed by a exponent - if (nextState != TheLetterE) - DoneWithValue(); - break; - case Exponent: - DoneWithValue(); - break; - } - } - - // label handling - switch (m_current_state) - { - case Label: - DoneWithLabel(); - break; - case EndOfLine: - if (seqPos) - { - SequencePosition sequencePos(numbers->size(), labels->size(), - m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); - // add a sequence element to the list - seqPos->push_back(sequencePos); - sequencePositionLast = sequencePos; - } - - // end of sequence determines record separation - if (m_endSequence) - recordCount = (long)labels->size(); - - PrepareStartLine(); - break; - case Whitespace: - // this is the start of the next space delimited entity - if (nextState != EndOfLine) - m_spaceDelimitedStart = m_byteCounter; - break; - } - - // label handling for next state - switch (nextState) - { - // do sign processing on nextState, since we still have the character handy - case Sign: - if (ch == '-') - m_wholeNumberMultiplier = -1; - break; - case ExponentSign: - if (ch == '-') 
- m_exponentMultiplier = -1; - break; - // going into whitespace or endOfLine, so end of space delimited entity - case Whitespace: - m_spaceDelimitedMax = m_byteCounter; - // hit whitespace and nobody processed anything, so add as label - //if (m_elementsConvertedThisLine == elementsProcessed) - // DoneWithLabel(); - break; - case EndOfLine: - if (m_current_state != Whitespace) - { - m_spaceDelimitedMax = m_byteCounter; - // hit whitespace and nobody processed anything, so add as label - //if (m_elementsConvertedThisLine == elementsProcessed) - // DoneWithLabel(); - } - // process the label at the end of a line - //if (m_labelMode == LabelLast && m_labels != NULL) - //{ - // StoreLastLabel(); - //} - // intentional fall-through - case LineCountEOL: - lineCount++; // done with another record - if (m_traceLevel > 1) - { - // print progress dots - if (recordCount % 100 == 0) - { - if (recordCount % 1000 == 0) - { - if (recordCount % 10000 == 0) - { - fprintf(stderr, "#"); - } - else - { - fprintf(stderr, "+"); - } - } - else - { - fprintf(stderr, "."); - } - } - } - break; - case LineCountOther: - m_spaceDelimitedStart = m_byteCounter; - break; - } - } - - m_current_state = nextState; - - // move to next character - m_byteCounter++; - bufferIndex++; - } // while - - // at the end of the file we may need to add an additional sequencePosition push - // this could probably be fixed by taking another pass through the loop above, but this is easier - if (seqPos) - { - SequencePosition sequencePos(numbers->size(), labels->size(), - m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); - // add the final sequence element if needed - if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos)) - { - seqPos->push_back(sequencePos); - } - } - - long TickStop = GetTickCount( ); - - long TickDelta = TickStop - TickStart; - - if (m_traceLevel > 2) - fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted ); - return lineCount; - } - - - int64_t GetFilePosition(); - void SetFilePosition(int64_t position); - - // HasMoreData - test if the current dataset have more data - // returns - true if it does, false if not - bool HasMoreData(); -}; - -// StoreLabel - string version gets last space delimited string and stores in labels vector -template <> -void SequenceParser::StoreLabel(float finalResult); - -// DoneWithLabel - string version stores string label -template <> -void SequenceParser::DoneWithLabel(); - -// StoreLastLabel - string version -template <> -void SequenceParser::StoreLastLabel(); - -// NOTE: Current code is identical to float, don't know how to specialize with template parameter that only covers one parameter - -// StoreLabel - string version gets last space delimited string and stores in labels vector -template <> -void SequenceParser::StoreLabel(double finalResult); - -// DoneWithLabel - string version stores string label -template <> -void SequenceParser::DoneWithLabel(); - -// StoreLastLabel - string version -template <> -void SequenceParser::StoreLastLabel(); - -/// language model sequence parser -template -class LMSequenceParser : public SequenceParser -{ -protected: - FILE * mFile; - std::wstring mFileName; - -public: - LMSequenceParser() { - mFile = nullptr; - }; - ~LMSequenceParser() { - if (mFile) fclose(mFile); - } - - void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string 
endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O") - { - assert(fileName != NULL); - mFileName = fileName; - m_dimFeatures = dimFeatures; - m_dimLabelsIn = dimLabelsIn; - m_beginSequenceIn = beginSequenceIn; - m_endSequenceIn = endSequenceIn; - m_dimLabelsOut = dimLabelsOut; - m_beginSequenceOut = beginSequenceOut; - m_endSequenceOut = endSequenceOut; - - m_parseMode = ParseNormal; - m_traceLevel = 0; - m_bufferSize = 0; - m_bufferStart = 0; - - m_beginTag = m_beginSequenceIn; - m_endTag = m_endSequenceIn; - - m_fileSize = -1; - m_fileBuffer = NULL; - - if (mFile) fclose(mFile); - - if (_wfopen_s(&mFile, fileName, L"rt") != 0) - RuntimeError("cannot open file %s", fileName); - } - - void ParseReset() - { - if (mFile) fseek(mFile, 0, SEEK_SET); - } - - // Parse - Parse the data - // recordsRequested - number of records requested - // labels - pointer to vector to return the labels - // numbers - pointer to vector to return the numbers - // seqPos - pointers to the other two arrays showing positions of each sequence - // returns - number of records actually read, if the end of file is reached the return value will be < requested records - long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) - { - assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); - assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); - - // transfer to member variables - m_numbers = numbers; - m_labels = labels; - - long TickStart = GetTickCount( ); - long recordCount = 0; - long orgRecordCount = (long)labels->size(); - long lineCount = 0; - SequencePosition sequencePositionLast(0,0,seqFlagNull); - /// get line - char ch2[MAXSTRING]; - while (recordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr) - { - - string ch = ch2; - std::vector vstr; - vstr = sep_string(ch, " "); - if (vstr.size() < 3) - continue; - - for (size_t i = 0; i < vstr.size(); i++) - { - labels->push_back(vstr[i]); - } - SequencePosition sequencePos(numbers->size(), labels->size(), - m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); - // add a sequence element to the list - seqPos->push_back(sequencePos); - sequencePositionLast = sequencePos; - - recordCount = (long)labels->size() - orgRecordCount; - - lineCount ++; - } // while - - long TickStop = GetTickCount( ); - - long TickDelta = TickStop - TickStart; - - if (m_traceLevel > 2) - fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted ); - return lineCount; - } - -}; - -typedef struct{ - size_t sLen; - size_t sBegin; - size_t sEnd; -} stSentenceInfo; -/// language model sequence parser -template -class LMBatchSequenceParser: public LMSequenceParser -{ -public: - vector mSentenceIndex2SentenceInfo; - -public: - LMBatchSequenceParser() { }; - ~LMBatchSequenceParser() { } - - void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O"); - - // Parse - Parse the data - // recordsRequested - number of records requested - // labels - pointer to vector to return the labels - // numbers - pointer to vector to return the numbers - // seqPos - pointers to the other two arrays showing positions of each sequence - // returns - number of records actually read, if the end of file is reached the return value will be 
< requested records - long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos); - -}; +// SequenceParser.h : Parses the UCI format using a custom state machine (for speed) +// +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define MAXSTRING 500000 +// UCI label location types +enum LabelMode +{ + LabelNone = 0, + LabelFirst = 1, + LabelLast = 2, +}; + +enum ParseMode +{ + ParseNormal = 0, + ParseLineCount = 1 +}; + +enum SequenceFlags +{ + seqFlagNull = 0, + seqFlagLineBreak = 1, // line break on the parsed line + seqFlagEmptyLine = 2, // empty line + seqFlagStartLabel = 4, + seqFlagStopLabel = 8 +}; + +// SequencePosition, save the ending indexes into the array for a sequence +struct SequencePosition +{ + size_t numberPos; // max position in the number array for this sequence + size_t labelPos; // max position in the label array for this sequence + unsigned flags; // flags that apply to this sequence + SequencePosition(size_t numPos, size_t labelPos, unsigned flags): + numberPos(numPos), labelPos(labelPos), flags(flags) + {} +}; + +// SequenceParser - the parser for the UCI format files +// for ultimate speed, this class implements a state machine to read these format files +template +class SequenceParser +{ +protected: + enum ParseState + { + WholeNumber = 0, + Remainder = 1, + Exponent = 2, + Whitespace = 3, + Sign = 4, + ExponentSign = 5, + Period = 6, + TheLetterE = 7, + EndOfLine = 8, + Label = 9, // any non-number things we run into + ParseStateMax = 10, // number of parse states + LineCountEOL = 10, + LineCountOther = 11, + AllStateMax = 12 + }; + + // type of label processing + ParseMode m_parseMode; + + // definition of label and feature dimensions + size_t m_dimFeatures; + + size_t m_dimLabelsIn; + std::string m_beginSequenceIn; // starting sequence string (i.e. ) + std::string m_endSequenceIn; // ending sequence string (i.e. ) + + size_t m_dimLabelsOut; + std::string m_beginSequenceOut; // starting sequence string (i.e. 'O') + std::string m_endSequenceOut; // ending sequence string (i.e. 
'O') + + // level of screen output + int m_traceLevel; + + // current state of the state machine + ParseState m_current_state; + + // state tables + DWORD *m_stateTable; + + // numeric state machine variables + double m_partialResult; + double m_builtUpNumber; + double m_divider; + double m_wholeNumberMultiplier; + double m_exponentMultiplier; + + // label state machine variables + size_t m_spaceDelimitedStart; + size_t m_spaceDelimitedMax; // start of the next whitespace sequence (one past the end of the last word) + int m_numbersConvertedThisLine; + int m_labelsConvertedThisLine; + int m_elementsConvertedThisLine; + + // sequence state machine variables + bool m_beginSequence; + bool m_endSequence; + std::string m_beginTag; + std::string m_endTag; + + // global stats + int m_totalNumbersConverted; + int m_totalLabelsConverted; + + // file positions/buffer + FILE * m_pFile; + int64_t m_byteCounter; + int64_t m_fileSize; + + BYTE * m_fileBuffer; + size_t m_bufferStart; + size_t m_bufferSize; + + // last label was a string (for last label processing) + bool m_lastLabelIsString; + + // vectors to append to + std::vector* m_numbers; // pointer to vectors to append with numbers + std::vector* m_labels; // pointer to vector to append with labels (may be numeric) + // FUTURE: do we want a vector to collect string labels in the non string label case? (signifies an error) + + // SetState for a particular value + void SetState(int value, ParseState m_current_state, ParseState next_state); + + // SetStateRange - set states transitions for a range of values + void SetStateRange(int value1, int value2, ParseState m_current_state, ParseState next_state); + + // SetupStateTables - setup state transition tables for each state + // each state has a block of 256 states indexed by the incoming character + void SetupStateTables(); + + // reset all line state variables + void PrepareStartLine(); + + // reset all number accumulation variables + void PrepareStartNumber(); + + // reset all state variables to start reading at a new position + void PrepareStartPosition(size_t position); + + // UpdateBuffer - load the next buffer full of data + // returns - number of records read + size_t UpdateBuffer(); + +public: + + // SequenceParser constructor + SequenceParser(); + // setup all the state variables and state tables for state machine + void Init(); + + // Parser destructor + ~SequenceParser(); + +private: + // DoneWithLabel - Called when a string label is found + void DoneWithLabel(); + + // Called when a number is complete + void DoneWithValue(); + + // store label is specialized by LabelType + void StoreLabel(NumType value); + + // StoreLastLabel - store the last label (for numeric types), tranfers to label vector + // string label types handled in specialization + void StoreLastLabel(); + +public: + // SetParseMode - Set the parsing mode + // mode - set mode to either ParseLineCount, or ParseNormal + void SetParseMode(ParseMode mode); + + // SetTraceLevel - Set the level of screen output + // traceLevel - traceLevel, zero means no output, 1 epoch related output, > 1 all output + void SetTraceLevel(int traceLevel); + + + // ParseInit - Initialize a parse of a file + // fileName - path to the file to open + // dimFeatures - number of features for precomputed features + // dimLabelsIn - number of lables possible on input + // dimLabelsOut - number of labels possible on output + // beginSequenceIn - beginSequence input label + // endSequenceIn - endSequence input label + // beginSequenceOut - beginSequence output 
label + // endSequenceOut - endSequence output label + // bufferSize - size of temporary buffer to store reads + // startPosition - file position on which we should start + void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O", size_t bufferSize=1024*256, size_t startPosition=0) + { + assert(fileName != NULL); + m_dimFeatures = dimFeatures; + m_dimLabelsIn = dimLabelsIn; + m_beginSequenceIn = beginSequenceIn; + m_endSequenceIn = endSequenceIn; + m_dimLabelsOut = dimLabelsOut; + m_beginSequenceOut = beginSequenceOut; + m_endSequenceOut = endSequenceOut; + + m_parseMode = ParseNormal; + m_traceLevel = 0; + m_bufferSize = bufferSize; + m_bufferStart = startPosition; + + m_beginTag = m_beginSequenceIn; + m_endTag = m_endSequenceIn; + + // if we have a file already open, cleanup + if (m_pFile != NULL) + SequenceParser::~SequenceParser(); + + errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" ); + if (err) + RuntimeError("SequenceParser::ParseInit - error opening file"); + int rc = _fseeki64(m_pFile, 0, SEEK_END); + if (rc) + RuntimeError("SequenceParser::ParseInit - error seeking in file"); + + m_fileSize = GetFilePosition(); + m_fileBuffer = new BYTE[m_bufferSize]; + SetFilePosition(startPosition); + } + + // Parse - Parse the data + // recordsRequested - number of records requested + // labels - pointer to vector to return the labels + // numbers - pointer to vector to return the numbers + // seqPos - pointers to the other two arrays showing positions of each sequence + // returns - number of records actually read, if the end of file is reached the return value will be < requested records + long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) + { + assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); + assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); + + // transfer to member variables + m_numbers = numbers; + m_labels = labels; + + long TickStart = GetTickCount( ); + long recordCount = 0; + long lineCount = 0; + size_t bufferIndex = m_byteCounter-m_bufferStart; + SequencePosition sequencePositionLast(0,0,seqFlagNull); + while (m_byteCounter < m_fileSize && recordCount < recordsRequested) + { + // check to see if we need to update the buffer + if (bufferIndex >= m_bufferSize) + { + UpdateBuffer(); + bufferIndex = m_byteCounter-m_bufferStart; + } + + char ch = m_fileBuffer[bufferIndex]; + + ParseState nextState = (ParseState)m_stateTable[(m_current_state<<8)+ch]; + + if( nextState <= Exponent ) + { + m_builtUpNumber = m_builtUpNumber * 10 + (ch - '0'); + // if we are in the decimal portion of a number increase the divider + if (nextState == Remainder) + m_divider *= 10; + } + + // only do a test on a state transition + if (m_current_state != nextState) + { + // System.Diagnostics.Debug.WriteLine("Current state = " + m_current_state + ", next state = " + nextState); + + // if the nextState is a label, we don't want to do any number processing, it's a number prefixed string + if (nextState != Label) + { + // do the numeric processing + switch (m_current_state) + { + case TheLetterE: + if (m_divider != 0) // decimal number + m_partialResult += m_builtUpNumber / m_divider; + else // integer + m_partialResult = m_builtUpNumber; + m_builtUpNumber = 0; + break; + case WholeNumber: + // could be followed by a 
remainder, or an exponent + if (nextState != TheLetterE) + if( nextState != Period) + DoneWithValue(); + if (nextState == Period) + { + m_partialResult = m_builtUpNumber; + m_divider = 1; + m_builtUpNumber = 0; + } + break; + case Remainder: + // can only be followed by a exponent + if (nextState != TheLetterE) + DoneWithValue(); + break; + case Exponent: + DoneWithValue(); + break; + } + } + + // label handling + switch (m_current_state) + { + case Label: + DoneWithLabel(); + break; + case EndOfLine: + if (seqPos) + { + SequencePosition sequencePos(numbers->size(), labels->size(), + m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); + // add a sequence element to the list + seqPos->push_back(sequencePos); + sequencePositionLast = sequencePos; + } + + // end of sequence determines record separation + if (m_endSequence) + recordCount = (long)labels->size(); + + PrepareStartLine(); + break; + case Whitespace: + // this is the start of the next space delimited entity + if (nextState != EndOfLine) + m_spaceDelimitedStart = m_byteCounter; + break; + } + + // label handling for next state + switch (nextState) + { + // do sign processing on nextState, since we still have the character handy + case Sign: + if (ch == '-') + m_wholeNumberMultiplier = -1; + break; + case ExponentSign: + if (ch == '-') + m_exponentMultiplier = -1; + break; + // going into whitespace or endOfLine, so end of space delimited entity + case Whitespace: + m_spaceDelimitedMax = m_byteCounter; + // hit whitespace and nobody processed anything, so add as label + //if (m_elementsConvertedThisLine == elementsProcessed) + // DoneWithLabel(); + break; + case EndOfLine: + if (m_current_state != Whitespace) + { + m_spaceDelimitedMax = m_byteCounter; + // hit whitespace and nobody processed anything, so add as label + //if (m_elementsConvertedThisLine == elementsProcessed) + // DoneWithLabel(); + } + // process the label at the end of a line + //if (m_labelMode == LabelLast && m_labels != NULL) + //{ + // StoreLastLabel(); + //} + // intentional fall-through + case LineCountEOL: + lineCount++; // done with another record + if (m_traceLevel > 1) + { + // print progress dots + if (recordCount % 100 == 0) + { + if (recordCount % 1000 == 0) + { + if (recordCount % 10000 == 0) + { + fprintf(stderr, "#"); + } + else + { + fprintf(stderr, "+"); + } + } + else + { + fprintf(stderr, "."); + } + } + } + break; + case LineCountOther: + m_spaceDelimitedStart = m_byteCounter; + break; + } + } + + m_current_state = nextState; + + // move to next character + m_byteCounter++; + bufferIndex++; + } // while + + // at the end of the file we may need to add an additional sequencePosition push + // this could probably be fixed by taking another pass through the loop above, but this is easier + if (seqPos) + { + SequencePosition sequencePos(numbers->size(), labels->size(), + m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | seqFlagLineBreak); + // add the final sequence element if needed + if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos)) + { + seqPos->push_back(sequencePos); + } + } + + long TickStop = GetTickCount( ); + + long TickDelta = TickStop - TickStart; + + if (m_traceLevel > 2) + fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted ); + return lineCount; + } + + + int64_t GetFilePosition(); + void SetFilePosition(int64_t position); + + // HasMoreData - test if the current dataset 
have more data + // returns - true if it does, false if not + bool HasMoreData(); +}; + +// StoreLabel - string version gets last space delimited string and stores in labels vector +template <> +void SequenceParser::StoreLabel(float finalResult); + +// DoneWithLabel - string version stores string label +template <> +void SequenceParser::DoneWithLabel(); + +// StoreLastLabel - string version +template <> +void SequenceParser::StoreLastLabel(); + +// NOTE: Current code is identical to float, don't know how to specialize with template parameter that only covers one parameter + +// StoreLabel - string version gets last space delimited string and stores in labels vector +template <> +void SequenceParser::StoreLabel(double finalResult); + +// DoneWithLabel - string version stores string label +template <> +void SequenceParser::DoneWithLabel(); + +// StoreLastLabel - string version +template <> +void SequenceParser::StoreLastLabel(); + +/// language model sequence parser +template +class LMSequenceParser : public SequenceParser +{ +protected: + FILE * mFile; + std::wstring mFileName; + +public: + LMSequenceParser() { + mFile = nullptr; + }; + ~LMSequenceParser() { + if (mFile) fclose(mFile); + } + + void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="", std::string endSequenceIn="", std::string beginSequenceOut="O", std::string endSequenceOut="O") + { + assert(fileName != NULL); + mFileName = fileName; + m_dimFeatures = dimFeatures; + m_dimLabelsIn = dimLabelsIn; + m_beginSequenceIn = beginSequenceIn; + m_endSequenceIn = endSequenceIn; + m_dimLabelsOut = dimLabelsOut; + m_beginSequenceOut = beginSequenceOut; + m_endSequenceOut = endSequenceOut; + + m_parseMode = ParseNormal; + m_traceLevel = 0; + m_bufferSize = 0; + m_bufferStart = 0; + + m_beginTag = m_beginSequenceIn; + m_endTag = m_endSequenceIn; + + m_fileSize = -1; + m_fileBuffer = NULL; + + if (mFile) fclose(mFile); + + if (_wfopen_s(&mFile, fileName, L"rt") != 0) + RuntimeError("cannot open file %s", fileName); + } + + void ParseReset() + { + if (mFile) fseek(mFile, 0, SEEK_SET); + } + + // Parse - Parse the data + // recordsRequested - number of records requested + // labels - pointer to vector to return the labels + // numbers - pointer to vector to return the numbers + // seqPos - pointers to the other two arrays showing positions of each sequence + // returns - number of records actually read, if the end of file is reached the return value will be < requested records + long Parse(size_t recordsRequested, std::vector *labels, std::vector *numbers, std::vector *seqPos) + { + assert(numbers != NULL || m_dimFeatures == 0 || m_parseMode == ParseLineCount); + assert(labels != NULL || m_dimLabelsIn == 0 && m_dimLabelsOut == 0|| m_parseMode == ParseLineCount); + + // transfer to member variables + m_numbers = numbers; + m_labels = labels; + + long TickStart = GetTickCount( ); + long recordCount = 0; + long orgRecordCount = (long)labels->size(); + long lineCount = 0; + SequencePosition sequencePositionLast(0,0,seqFlagNull); + /// get line + char ch2[MAXSTRING]; + while (recordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr) + { + + string ch = ch2; + std::vector vstr; + vstr = sep_string(ch, " "); + if (vstr.size() < 3) + continue; + + for (size_t i = 0; i < vstr.size(); i++) + { + labels->push_back(vstr[i]); + } + SequencePosition sequencePos(numbers->size(), labels->size(), + m_beginSequence?seqFlagStartLabel:0 | m_endSequence?seqFlagStopLabel:0 | 
+
+typedef struct {
+    size_t sLen;    // sentence length in tokens
+    size_t sBegin;  // index of the sentence's first token in the label array
+    size_t sEnd;    // index one past the sentence's last token
+} stSentenceInfo;
+
+/// language model sequence parser that also records per-sentence positions for batching
+template <typename NumType, typename LabelType>
+class LMBatchSequenceParser : public LMSequenceParser<NumType, LabelType>
+{
+public:
+    // filled in by Parse(): one entry per input sentence, in file order
+    vector<stSentenceInfo> mSentenceIndex2SentenceInfo;
+
+public:
+    LMBatchSequenceParser() { }
+    ~LMBatchSequenceParser() { }
+
+    void ParseInit(LPCWSTR fileName, size_t dimFeatures, size_t dimLabelsIn, size_t dimLabelsOut, std::string beginSequenceIn="</s>", std::string endSequenceIn="</s>", std::string beginSequenceOut="O", std::string endSequenceOut="O");
+
+    // Parse - Parse the data
+    // recordsRequested - number of records requested
+    // labels - pointer to vector to return the labels
+    // numbers - pointer to vector to return the numbers
+    // seqPos - pointers to the other two arrays showing positions of each sequence
+    // returns - number of records actually read; if the end of file is reached the return value will be < requested records
+    long Parse(size_t recordsRequested, std::vector<LabelType> *labels, std::vector<NumType> *numbers, std::vector<SequencePosition> *seqPos);
+};
diff --git a/DataReader/SequenceReader/SequenceReader.cpp b/DataReader/SequenceReader/SequenceReader.cpp
index a95d9ee6d..cec66fd52 100644
--- a/DataReader/SequenceReader/SequenceReader.cpp
+++ b/DataReader/SequenceReader/SequenceReader.cpp
@@ -1,2010 +1,2007 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// SequenceReader.cpp : Defines the exported functions for the DLL application.
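-//
-// (The readers below drive the parsers declared in SequenceParser.h: the
-// minibatch logic in BatchSequenceReader relies on
-// LMBatchSequenceParser::mSentenceIndex2SentenceInfo to group sentences of
-// equal length, up to nbruttsineachrecurrentiter of them, into one minibatch.)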
-// - - -#include "stdafx.h" -#define DATAREADER_EXPORTS // creating the exports here -#include "DataReader.h" -#include "SequenceReader.h" -#ifdef LEAKDETECT -#include // leak detection -#endif -#include "fileutil.h" // for fexists() - -namespace Microsoft { namespace MSR { namespace CNTK { - -// ReadLine - Read a line -// readSample - sample to read in global sample space -// returns - true if we successfully read a record, otherwise false -template -bool SequenceReader::ReadRecord(size_t /*readSample*/) -{ - return false; // not used -} - -// RecordsToRead - Determine number of records to read to populate record buffers -// mbStartSample - the starting sample from which to read -// tail - we are checking for possible remainer records to read (default false) -// returns - true if we have more to read, false if we hit the end of the dataset -template -size_t SequenceReader::RecordsToRead(size_t mbStartSample, bool tail) -{ - assert(mbStartSample >= m_epochStartSample); - // determine how far ahead we need to read - // need to read to the end of the next minibatch - size_t epochSample = mbStartSample; - epochSample %= m_epochSize; - - // determine number left to read for this epoch - size_t numberToEpoch = m_epochSize - epochSample; - // we will take either a minibatch or the number left in the epoch - size_t numberToRead = min(numberToEpoch, m_mbSize); - if (numberToRead == 0 && !tail) - numberToRead = m_mbSize; - - return numberToRead; -} - -// GetIdFromLabel - get an Id from a Label -// mbStartSample - the starting sample we are ensureing are good -// endOfDataCheck - check if we are at the end of the dataset (no wraparound) -// returns - true if we have more to read, false if we hit the end of the dataset -template -/*IDataReader::LabelIdType*/ unsigned SequenceReader::GetIdFromLabel(const std::string& labelValue, LabelInfo& labelInfo) -{ - auto found = labelInfo.mapLabelToId.find(labelValue); - - // not yet found, add to the map - if (found == labelInfo.mapLabelToId.end()) - { - labelInfo.mapLabelToId[labelValue] = labelInfo.idMax; - labelInfo.mapIdToLabel[labelInfo.idMax] = labelValue; - found = labelInfo.mapLabelToId.find(labelValue); - labelInfo.idMax++; - } - return found->second; -} - -template -/*IDataReader::LabelIdType*/ bool SequenceReader::CheckIdFromLabel(const std::string& labelValue, const LabelInfo& labelInfo, unsigned & labelId) -{ - auto found = labelInfo.mapLabelToId.find(labelValue); - - // not yet found, add to the map - if (found == labelInfo.mapLabelToId.end()) - { - return false; - } - labelId = found->second; - return true; -} - -// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested -// mbStartSample - the starting sample we are starting with -// endOfDataCheck - check if we are at the end of the dataset (no wraparound) -// returns - true if we have more to read, false if we hit the end of the dataset -template -bool SequenceReader::EnsureDataAvailable(size_t mbStartSample, bool /*endOfDataCheck*/) -{ - assert(mbStartSample >= m_epochStartSample); - // determine how far ahead we need to read - // need to read to the end of the next minibatch - size_t epochSample = mbStartSample; - bool moreToRead = true; - - size_t numberToRead = RecordsToRead(mbStartSample); - - // check to see if we have the proper records read already - if (m_readNextSample >= mbStartSample+numberToRead && mbStartSample >= m_epochStartSample) - return true; - - // if we have another sequence already read and waiting, just return now - if (m_seqIndex < 
m_sequence.size()) - return true; - - m_seqIndex = 0; - m_mbStartSample = 0; - m_sequence.clear(); - m_featureData.clear(); - m_labelIdData.clear(); - - m_readNextSample = 0; - epochSample = 0; - - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - - bool nextWord = false; - if (m_labelInfo[labelInfoOut].type == labelNextWord) - { - nextWord = true; - } - LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; - - //if (m_labelIdData.size() > epochSample) - //{ - // m_labelIdData.resize(epochSample); - // m_labelData.resize(epochSample*labelInfo.dim); - //} - - // see how many we already read - int sequencesRead = 0; - std::vector featureTemp; - std::vector labelTemp; - std::vector seqPos; - do - { - int numRead = m_parser.Parse(CACHE_BLOG_SIZE, &labelTemp, &featureTemp, &seqPos); - moreToRead = (numRead != 0); - - // translate from the sparse parsed data format to the to the training format data - int label = 0; - bool bSentenceStart = false; - SequencePosition sposLast = SequencePosition(0,0,seqFlagNull); - for (int seq = 0; seq < numRead; seq++) - { - // check - SequencePosition spos = seqPos[seq]; - if (spos.labelPos == sposLast.labelPos && spos.numberPos == sposLast.numberPos) - continue; - sposLast = spos; - - bSentenceStart = true; - - // loop through the labels for this entry - while (label < spos.labelPos) /// need to minus one since - { - - // labelIn should be a category label - LabelType labelValue = labelTemp[label++]; - - if (trim(labelValue).size() == 0) - continue; // empty input - - // check for end of sequence marker - if (!bSentenceStart && (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str()) || ((label - 1 )% m_mbSize == 0) )) - { - // ignore those cases where $ is put in the begining, because those are used for initialization purpose - spos.flags |= seqFlagStopLabel; - sequencesRead++; - - // create the seqence table - m_sequence.push_back(epochSample); - if ((m_sequence.size() == 1 ? epochSample : epochSample - m_sequence[m_sequence.size()-2]) > m_mbSize) - { - fprintf(stderr, "read sentence length is longer than the minibatch size. should be smaller. increase the minibatch size to at least %d", epochSample); - RuntimeError("read sentence length is longer than the minibatch size. should be smaller. increase the minibatch size to at least %d", epochSample); - } - - if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) - continue; /// ignore sentence ending - } - - // to-do, should ignore , check the sentence ending is - // need to remove from the training set - // allocate and initialize the next chunck of featureData - if (labelIn.type == labelCategory) - { - LabelIdType index = GetIdFromLabel(labelValue, labelIn); - - // use the found value, and set the appropriate location to a 1.0 - assert(labelIn.dim > index); // if this goes off labelOut dimension is too small - m_featureData.push_back((float)index); - } - else - { - RuntimeError("Input label expected to be a category label"); - } - - // if we have potential features - if (m_featureDim > 0) - { - RuntimeError("to-do. Assume sparse input feature. need to change the code from dense matrix"); - // move the position up to the start of the additional features section -/* pos += labelIn.dim; - assert(pos + m_featureDim == m_featureData.size()); - // this has to be an even number, a pair of index and value - if ((spos.numberPos&1) != 0) - RuntimeError("Features must be specified in pairs (index:value). 
Invalid features for label '%s'\n", labelValue); - - while (feature < spos.numberPos) - { - int index = (int)featureTemp[feature++]; - if (index < 0 || index >= m_featureDim) - RuntimeError("Invalid feature index: %d for label '%s', feature max dimension = %lld\n", index, labelValue, m_featureDim); - - ElemType value = featureTemp[feature++]; - m_featureData[pos+index] = value; - } - */ - } - - // now get the output label - if (m_labelInfo[labelInfoOut].type == labelCategory) - { - labelValue = labelTemp[label++]; - } - else if (nextWord) - { - // this is the next word (label was incremented above) - labelValue = labelTemp[label]; - if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) - { - labelValue = labelInfo.endSequence; - } - } - else - { - RuntimeError("Invalid output label type, expected Category, or Next Word"); - } - - // get the ID from the label - LabelIdType id = GetIdFromLabel(labelValue, labelInfo); - m_labelIdData.push_back(id); - - m_readNextSample++; - epochSample++; - if (!m_endReached) - m_totalSamples++; // add to the total number of records in the dataset - - bSentenceStart = false; - } - - { - // check if the reading is right - int jEnd = (int) m_labelIdData.size() - 1; - LabelIdType index ; - if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) - RuntimeError("cannot find sentence begining label"); - - if (m_labelIdData[jEnd] != index ) - /// for language model, the first word/letter has to be - RuntimeError("SequenceReader: the last letter/word of a batch has to be the sentence ending symbol"); - } - - } - - m_readNextSampleLine += numRead; - } - while (sequencesRead < 1 && moreToRead); // we need to read at least one sequence or have no more data - - // if we read to the end, update appropriate variables - if (!moreToRead) - { - UpdateDataVariables(); - } - - // if there more to read - return moreToRead; -} - -// UpdateDataVariables - Update variables that depend on the dataset being completely read -template -void SequenceReader::UpdateDataVariables() -{ - // if we haven't been all the way through the file yet - if (!m_endReached) - { - // get the size of the dataset - assert(m_totalSamples*m_featureCount >= m_featureData.size()); - - // if they want us to determine epoch size based on dataset size, do that - if (m_epochSize == requestDataSize) - { - m_epochSize = m_totalSamples; - } - - WriteLabelFile(); - - // we got to the end of the dataset - m_endReached = true; - } - - // update the label dimension if it is not big enough, need it here because m_labelIdMax get's updated in the processing loop (after a read) - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - if (m_labelInfo[index].type == labelCategory && m_labelInfo[index].idMax > m_labelInfo[index].dim) - m_labelInfo[index].dim = m_labelInfo[index].idMax; // update the label dimensions if different - } -} - -template -void SequenceReader::WriteLabelFile() -{ - // update the label dimension if it is not big enough, need it here because m_labelIdMax get's updated in the processing loop (after a read) - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - LabelInfo& labelInfo = m_labelInfo[index]; - - // write out the label file if they don't have one - if (!labelInfo.fileToWrite.empty()) - { - if (labelInfo.mapIdToLabel.size() > 0) - { - File labelFile(labelInfo.fileToWrite, fileOptionsWrite | fileOptionsText); - for (int i=0; i < labelInfo.mapIdToLabel.size(); ++i) - { - labelFile << labelInfo.mapIdToLabel[i] << '\n'; - } - 
labelInfo.fileToWrite.clear(); - } - else if (!m_cachingWriter) - { - fprintf(stderr, "WARNING: file %ws NOT written to disk, label files only written when starting at epoch zero!", labelInfo.fileToWrite.c_str()); - } - } - } -} - -template -void SequenceReader::LoadLabelFile(const std::wstring &filePath, std::vector& retLabels) -{ - File file(filePath, fileOptionsRead); - - // initialize with file name - std::string path = msra::strfun::utf8(filePath); - auto location = path.find_last_of("/\\"); - if (location != npos) - path = path.substr(location+1); - - // read the entire file into a string - string str; - retLabels.resize(0); - while (!file.IsEOF()) - { - file.GetLine(str); - - // check for a comment line - string::size_type pos = str.find_first_not_of(" \t"); - if (pos != -1) - { - retLabels.push_back((LabelType)trim(str)); - } - } -} - - -// Destroy - cleanup and remove this class -// NOTE: this destroys the object, and it can't be used past this point -template -void SequenceReader::Destroy() -{ - delete this; -} - -// Init - Reader Initialize for multiple data sets -// config - [in] configuration parameters for the datareader -// Sample format below: -//# Parameter values for the reader -//reader=[ -// # reader to use -// readerType=SequenceReader -// randomize=None -// # additional features dimension -// featureDim=784 -// file=c:\data\sequence\sequence.txt -// labelIn=[ -// dim=26 -// labelMappingFile=c:\data\sequence\alphabet.txt -// labelType=Category -// beginSequence="" -// endSequence="" -// ] -// labelOut=[ -// dim=129 -// labelMappingFile=c:\data\sequence\phonemes.txt -// labelType=Category -// beginSequence="O" -// endSequence="O" -// ] -//] -template -void SequenceReader::Init(const ConfigParameters& readerConfig) -{ - // See if the user wants caching - m_cachingReader = NULL; - m_cachingWriter = NULL; - - // NOTE: probably want to re-enable at some point - - // initialize the cache - //InitCache(readerConfig); - //m_readerConfig = readerConfig; - - //// if we have a cache, no need to parse the test files... 
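-    // (ReadClassInfo, called from this Init when a "wordclass" file is
-    // configured, expects one line per word of the form
-    // "wordId<TAB>count<TAB>word<TAB>classId", matching its fscanf_s calls.)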
- //if (m_cachingReader) - // return; - - std::vector features; - std::vector labels; - GetFileConfigNames(readerConfig, features, labels); - if (features.size() > 0) - { - m_featuresName = features[0]; - } - - if (labels.size() == 2) - { - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelsName[index] = labels[index]; - } - } - else - RuntimeError("two label definitions (in and out) required for Sequence Reader"); - - ConfigParameters featureConfig = readerConfig(m_featuresName,""); - ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; - - class_size = 0; - m_featureDim = featureConfig("dim"); - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelInfo[index].idMax = 0; - m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); - m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); - - // determine label type desired - std::string labelType(labelConfig[index]("labelType","Category")); - if (labelType == "Category") - { - m_labelInfo[index].type = labelCategory; - } - else if (labelType == "NextWord") - { - // in this case, it's all identical to the Input labels, except the data type - m_labelInfo[index].type = labelNextWord; - m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; - } - else if (labelType == "None") - { - m_labelInfo[index].type = labelNone; - m_labelInfo[index].dim = 0; // override for no labels - } - - // if we have labels, we need a label Mapping file, it will be a file with one label per line - if (m_labelInfo[index].type != labelNone) - { - std::wstring wClassFile = readerConfig("wordclass", ""); - nwords = labelConfig[index]("labelDim"); - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - } - - std::vector arrayLabels; - std::wstring labelPath = labelConfig[index]("labelMappingFile"); - if (fexists(labelPath)) - { - LoadLabelFile(labelPath, arrayLabels); - for (int i=0; i < arrayLabels.size(); ++i) - { - LabelType label = arrayLabels[i]; - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); - m_labelInfo[index].mapName = labelPath; - } - else - { - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - int iMax = -1, i; - for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) - { - LabelType label = ptr->first; - i = ptr->second; - iMax = max(i, iMax); - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)(iMax+1); - - OrganizeClass(); - - } - m_labelInfo[index].mapName = labelPath; - - m_labelInfo[index].fileToWrite = labelPath; - } - } - - m_labelInfo[index].dim = labelConfig[index]("labelDim"); - - // update dimension if the file says it's bigger - if (m_labelInfo[index].dim < m_labelInfo[index].idMax) - { - m_labelInfo[index].dim = m_labelInfo[index].idMax; - } - } - - // initialize all the variables - m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; - m_endReached = false; - m_readNextSampleLine = 0; - m_readNextSample = 0; - m_traceLevel = readerConfig("traceLevel","0"); - m_parser.SetTraceLevel(m_traceLevel); - - if (readerConfig.Exists("randomize")) - { - string randomizeString = readerConfig("randomize"); - if (randomizeString == "None") - { - ; - } - else if (randomizeString == "Auto") - { - ; - } - else - { - ;//readerConfig("randomize"); - } - } - else - { - ; 
//randomizeAuto; - } - - // The input data is a combination of the label Data and extra feature dims together -// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; - m_featureCount = 1; - - std::wstring m_file = readerConfig("file"); - if (m_traceLevel > 0) - fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); - - const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; - m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); -} - -template -void SequenceReader::ReadWord(char *word, FILE *fin) -{ - int a=0, ch; - - while (!feof(fin)) { - ch=fgetc(fin); - - if (ch==13) continue; - - if ((ch==' ') || (ch=='\t') || (ch=='\n')) { - if (a>0) { - if (ch=='\n') ungetc(ch, fin); - break; - } - - if (ch=='\n') { - strcpy_s(word, strlen(""), (char *)""); - return; - } - else continue; - } - - word[a]=(char)ch; - a++; - - if (a>=MAX_STRING) { - //printf("Too long word found!\n"); //truncate too long words - a--; - } - } - word[a]=0; -} - -template -void SequenceReader::ReadClassInfo(const wstring & vocfile, bool /*flatten*/) -{ - char strFileName[MAX_STRING]; - char stmp[MAX_STRING]; - string strtmp; - size_t sz; - int cnt, clsidx, b; - class_size = 0; - - wcstombs_s(&sz, strFileName, 2048, vocfile.c_str(), vocfile.length()); - - FILE * vin; - vin = fopen(strFileName, "rt") ; - - if (vin == nullptr) - { - RuntimeError("cannot open word class file"); - } - for (int a = 0; a < nwords; a++) - { - fscanf_s(vin, "%6d\t%10d\t", &b, &cnt); - ReadWord(stmp, vin); - fscanf_s(vin, "%d\t\n", &clsidx); - strtmp = stmp; - idx4cnt[b] = cnt; - word4idx[strtmp] = b; - idx4word[b]= strtmp; - - idx4class[b] = clsidx; - class_size = max(class_size, clsidx); - } - fclose(vin); - - class_size ++; -} - -// InitCache - Initialize the caching reader if cache files exist, otherwise the writer -// readerConfig - reader configuration -template -void SequenceReader::InitCache(const ConfigParameters& readerConfig) -{ - // check for a writer tag first (lets us know we are caching) - if (!readerConfig.Exists("writerType")) - return; - - // first try to open the binary cache - bool found = false; - try - { - // TODO: need to go down to all levels, maybe search for sectionType - ConfigArray filesList(','); - vector names; - if (readerConfig.Exists("wfile")) - { - filesList.push_back(readerConfig("wfile")); - if (fexists(readerConfig("wfile"))) - found = true; - } - FindConfigNames(readerConfig, "wfile", names); - for (auto name : names) - { - ConfigParameters config = readerConfig(name); - filesList.push_back(config("wfile")); - if (fexists(config("wfile"))) - found = true; - } - - // if we have a file already, we are going to read the cached files - if (found) - { - ConfigParameters config; - readerConfig.CopyTo(config); - // mmodify the config so the reader types look correct - config["readerType"] = config("writerType"); - config["file"] = filesList; - m_cachingReader = new DataReader(config); - } - else - { - m_cachingWriter = new DataWriter(readerConfig); - - // now get the section names for map and category types - std::map sections; - m_cachingWriter->GetSections(sections); - for (auto pair : sections) - { - // TODO: we would need to add a sequenceMap type here as well - // or maybe change to heirarchal name (i.e. 
root.labelIn.map) - if (pair.second == sectionTypeCategoryLabel) - { - m_labelsCategoryName[labelInfoOut] = pair.first; - } - else if (pair.second == sectionTypeLabelMapping) - { - m_labelsMapName[labelInfoOut] = pair.first; - } - } - } - } - catch (runtime_error err) - { - fprintf(stderr,"Error attemping to create Binary%s\n%s\n",found?"Reader":"Writer",err.what()); - delete m_cachingReader; - m_cachingReader = NULL; - delete m_cachingWriter; - m_cachingWriter = NULL; - } - catch (...) - { - // if there is any error, just get rid of the object - fprintf(stderr,"Error attemping to create Binary%s\n",found?"Reader":"Writer"); - delete m_cachingReader; - m_cachingReader = NULL; - delete m_cachingWriter; - m_cachingWriter = NULL; - } -} - -// destructor - virtual so it gets called properly -template -SequenceReader::~SequenceReader() -{ - ReleaseMemory(); - delete m_cachingReader; - delete m_cachingWriter; -} - -// ReleaseMemory - release the memory footprint of SequenceReader -// used when the caching reader is taking over -template -void SequenceReader::ReleaseMemory() -{ - if (m_featuresBuffer!=NULL) - delete[] m_featuresBuffer; - m_featuresBuffer=NULL; - if (m_labelsBuffer!=NULL) - delete[] m_labelsBuffer; - m_labelsBuffer=NULL; - if (m_labelsIdBuffer!=NULL) - delete[] m_labelsIdBuffer; - m_labelsIdBuffer=NULL; - m_featureData.clear(); - m_labelIdData.clear(); - m_labelData.clear(); - m_sequence.clear(); -} - -//SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch -template -void SequenceReader::SetupEpoch() -{ - // if we are starting fresh (epoch zero and no data read), init everything - // however if we are using cachingWriter, we need to know record count, so do that first - if (m_epoch == 0 && m_totalSamples == 0 && m_cachingWriter == NULL) - { - m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0; - m_parser.SetFilePosition(0); - } - else // otherwise, position the read to start at the right location - { - m_seqIndex = 0; - // don't know the total number of samples yet, so count them - if (m_totalSamples == 0) - { - if (m_traceLevel > 0) - fprintf(stderr, "starting at epoch %d parsing all data to determine record count\n", m_epoch); - // choose a large number to read - m_parser.SetFilePosition(0); - m_mbStartSample = 0; - while (EnsureDataAvailable(m_mbStartSample)) - { - m_mbStartSample = m_totalSamples; - m_seqIndex = m_sequence.size(); - } - if (m_traceLevel > 0) - fprintf(stderr, "\n %lld records found\n", m_totalSamples); - } - m_seqIndex = 0; - - // we have a slight delima here, if we haven't determined the end of the file yet - // and the user told us to find how many records are in the file, we can't distinguish "almost done" - // with a file (a character away) and the middle of the file. So read ahead a record to see if it's there. 
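-        // probing HasMoreData() below forces UpdateDataVariables() to run once
-        // the true end of file has been seen, so m_totalSamples and the label
-        // dimensions are final before we rewind to the first sample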
- bool endReached = m_endReached; - if (!endReached) - { - if (!m_parser.HasMoreData()) - { - endReached = true; - UpdateDataVariables(); - assert(m_endReached); - } - } - - // always start from the first sample - m_epochStartSample = m_mbStartSample = 0; - } -} - -template -void SequenceReader::LMSetupEpoch() -{ - m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0; -} - -// utility function to round an integer up to a multiple of size -size_t RoundUp(size_t value, size_t size) -{ - return ((value + size -1)/size)*size; -} - -//StartMinibatchLoop - Startup a minibatch loop -// mbSize - [in] size of the minibatch (number of Samples, etc.) -// NOTE: for sequence data, this will be the MAX size of a sequence, as every sequence could be a different length -// epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run) -// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset -template -void SequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) -{ - // if we aren't currently caching, see if we can use a cache - if (!m_cachingReader && !m_cachingWriter) - { - InitCache(m_readerConfig); - if (m_cachingReader) - ReleaseMemory(); // free the memory used by the SequenceReader - } - - // if we are reading from the cache, do so now and return - if (m_cachingReader) - { - m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); - return; - } - - if (m_featuresBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; - memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); - } - - if (m_labelsBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - if (labelInfo.type == labelCategory) - { - m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); - m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; - memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); - } - else if (labelInfo.type != labelNone) - { - m_labelsBuffer = new ElemType[mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); - m_labelsIdBuffer = NULL; - } - } - - m_mbSize = mbSize; - if (requestedEpochSamples == requestDataSize) - { - if (!m_endReached) - { - m_epochSize = requestDataSize; - } - } - else - { - m_epochSize = requestedEpochSamples; - } - - // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set - size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; - m_epoch = epoch; - m_mbStartSample = epoch*m_epochSize; - - // allocate room for the data - m_featureData.reserve(m_featureCount*epochSize); - if (m_labelInfo[labelInfoOut].type == labelCategory) - m_labelIdData.reserve(epochSize); - else if (m_labelInfo[labelInfoOut].type != labelNone) - m_labelData.reserve(epochSize); - m_sequence.reserve(m_seqIndex); // clear out the sequence array - /// this is too complicated for LM - // SetupEpoch(); - /// use the LMSetupEpoch() instead - LMSetupEpoch(); - - m_clsinfoRead = false; - m_idx2clsRead = false; - - m_parser.ParseReset(); -} - -template -bool SequenceReader::DataEnd(EndDataType endDataType) 
-{ - bool ret = false; - switch (endDataType) - { - case endDataNull: - assert(false); - break; - case endDataEpoch: - ret = m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]; - break; - case endDataSet: - ret = !EnsureDataAvailable(m_mbStartSample); - break; - case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true - ret = SentenceEnd(); - break; - } - return ret; -} - - -template -bool SequenceReader::SentenceEnd() -{ - // this is after getMinibatch size, which has increased m_seqIndex by 1 - // so the real index is m_seqIndex - 1; - int seqIndex = (int)m_seqIndex - 1; - - // now get the labels - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - size_t actualmbsize = 0; - - // figure out the size of the next sequence - if (seqIndex > 0) - { - actualmbsize = m_sequence[seqIndex] - m_sequence[seqIndex-1]; - } - else - { - actualmbsize = m_sequence[0]; - } - - if (actualmbsize < m_mbSize) - return true; - - size_t jEnd = m_sequence[seqIndex]-1; - - if (labelInfo.type == labelCategory) - { - LabelIdType index ; - if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) - RuntimeError("cannot find sentence begining label"); - - if (m_labelIdData[jEnd] == index ) - return true; - else - return false; - } - return false; -} - -template -void SequenceReader::GetLabelOutput(std::map*>& matrices, - size_t m_mbStartSample, size_t actualmbsize) -{ - size_t j = 0; - Matrix* labels = matrices[m_labelsName[labelInfoOut]]; - if (labels == nullptr) return; - - labels->Resize(nwords + class_size, actualmbsize, false); - - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - - int wrd = m_labelIdData[jRand]; - int clsidx = idx4class[wrd]; - - labels->SetValue(wrd, j, 1); - - if (class_size > 0) - labels->SetValue(nwords + clsidx, j, 1); - } - -} - -template -void SequenceReader::GetInputToClass(std::map*>& matrices) -{ - Matrix* idx2cls= matrices[STRIDX2CLS]; - if (idx2cls== nullptr) return; - - if (m_idx2clsRead) return; - - // populate local CPU matrix - m_id2classLocal->SwitchToMatrixType(MatrixType::DENSE); - m_id2classLocal->Resize(nwords , 1, false); - - //move to CPU since element-wise operation is expensive and can go wrong in GPU - int curDevId = m_id2classLocal->GetDeviceId(); - m_id2classLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - for (size_t j = 0; j < nwords ; j++) - { - int clsidx = idx4class[(int)j]; - (*m_id2classLocal)(j,0) = (float)clsidx; - } - m_id2classLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - - int oldDeviceId = idx2cls->GetDeviceId(); - // caution, SetValue changes idx2cls from GPU to CPU, may change this behavior later - idx2cls->SetValue(*m_id2classLocal); - idx2cls->TransferFromDeviceToDevice(idx2cls->GetDeviceId(), oldDeviceId, true); - - m_idx2clsRead = true; -} - -template -void SequenceReader::GetClassInfo(std::map*>& matrices) -{ - Matrix* clsinfo = matrices[CLASSINFO]; - if (clsinfo == nullptr) return; - - if (m_clsinfoRead) return; - - // populate local CPU matrix - m_classInfoLocal->SwitchToMatrixType(MatrixType::DENSE); - m_classInfoLocal->Resize(2, class_size); - - //move to CPU since element-wise operation is expensive and can go wrong in GPU - int curDevId = m_classInfoLocal->GetDeviceId(); - 
m_classInfoLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - - int clsidx; - int prvcls = -1; - for (size_t j = 0; j < nwords; j++) - { - clsidx = idx4class[(int)j]; - if (prvcls != clsidx) - { - if (prvcls >= 0) - (*m_classInfoLocal)(1, prvcls) = (float)j; - prvcls = clsidx; - (*m_classInfoLocal)(0, prvcls) = (float)j; - } - } - (*m_classInfoLocal)(1, prvcls) = (float)nwords; - - m_classInfoLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - - int oldDeviceId = clsinfo->GetDeviceId(); - // caution, SetValue changes m_classInfoLocal from GPU to CPU, may change this behavior later - clsinfo->SetValue(*m_classInfoLocal); - clsinfo->TransferFromDeviceToDevice(clsinfo->GetDeviceId(), oldDeviceId, true); - - m_clsinfoRead = true; -} - -template -bool SequenceReader::GetMinibatch(std::map*>& matrices) -{ - - // get out if they didn't call StartMinibatchLoop() first - if (m_mbSize == 0) - return false; - - // check to see if we have changed epochs, if so we are done with this one. - if (m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]) - return false; - - bool moreData = EnsureDataAvailable(m_mbStartSample); - if (moreData == false) - return false; - - // figure which sweep of the randomization we are on - size_t recordStart = m_totalSamples?m_mbStartSample%m_totalSamples:m_mbStartSample; - - // actual size is the size of the next seqence - size_t actualmbsize = 0; - - // figure out the size of the next sequence - if (m_seqIndex > 0 && m_seqIndex < m_sequence.size() && m_sequence.size() > 1) - { - actualmbsize = m_sequence[m_seqIndex] - m_sequence[m_seqIndex-1]; - } - else - { - actualmbsize = m_sequence[0]; - } - - if (actualmbsize > m_mbSize){ - RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. 
memory can crash!", m_mbSize, actualmbsize); - } - - // hit the end of the dataset, - if (!moreData) - { - // make sure we take into account hitting the end of the dataset (not wrapping around) - actualmbsize = min(m_totalSamples-recordStart,actualmbsize); - } - - // now get the labels - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - if (labelInfo.type == labelCategory) - { - memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*actualmbsize); - memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*actualmbsize); - } - else if (labelInfo.type != labelNone) - { - memset(m_labelsBuffer,0,sizeof(ElemType)*1*actualmbsize); - } - - if (actualmbsize > 0) - { - - memset(m_featuresBuffer, 0, sizeof(ElemType)*actualmbsize*labelInfo.dim); - - //loop through all the samples - int j = 0; - Matrix& features = *matrices[m_featuresName]; - if (matrices.find(m_featuresName) != matrices.end()) - { - if(features.GetMatrixType() == MatrixType::DENSE) - { - features.Resize(labelInfo.dim, actualmbsize, false); - features.SetValue(0); - } - else - { - features.Resize(labelInfo.dim, actualmbsize); - features.Reset(); - } - } - - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - - // vector of feature data goes into matrix column - size_t idx = (size_t)m_featureData[jRand]; - m_featuresBuffer[j*labelInfo.dim + idx] = (ElemType)1; - - if (matrices.find(m_featuresName) != matrices.end()) - features.SetValue(idx, j, (ElemType)1); - } - - GetLabelOutput(matrices, m_mbStartSample, actualmbsize); - GetInputToClass(matrices); - GetClassInfo(matrices); - - // make sure that the sequence index matches our end index - assert(m_sequence[m_seqIndex] == m_mbStartSample+actualmbsize); - // go to the next sequence - m_seqIndex++; - } - - // advance to the next minibatch - m_mbStartSample += actualmbsize; - - // if they don't want partial minibatches, skip data transfer and return - if (actualmbsize == 0) // no records found (end of minibatch) - { - return false; - } - - // now transfer to the GPU as needed - try{ - // get the features array - if (matrices.find(m_featuresName) == matrices.end()) - { - Matrix& nbs = *matrices[L"numberobs"]; - int curDevId = nbs.GetDeviceId(); - nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - nbs(0,0) = (float)actualmbsize; - nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - for (size_t i = 0; i < actualmbsize; i++) - { - std::wstring ws = msra::strfun::wstrprintf (L"feature%d", i); - Matrix& features = *matrices[ws]; - features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); - } - } - }catch(...) - { - RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); - } - - try - { - if (labelInfo.type == labelCategory) - { - if (matrices.find(m_labelsName[labelInfoOut]) == matrices.end()) - { - for (size_t i = 0; i < actualmbsize; i++) - { - std::wstring ws = msra::strfun::wstrprintf (L"label%d", i); - Matrix* labels = matrices[ws]; - labels->SetValue(labelInfo.dim, 1, &m_labelsBuffer[i * labelInfo.dim],matrixFlagNormal); - } - } - } - else if (labelInfo.type != labelNone) - { - Matrix* labels = matrices[m_labelsName[labelInfoOut]]; - labels->SetValue(1, actualmbsize,m_labelsBuffer,matrixFlagNormal); - } - }catch(...) 
- { - RuntimeError("cannot find matrices for %s", m_labelsName[labelInfoOut]); - } - - // we read some records, so process them - return true; -} - -template -void SequenceReader::OrganizeClass() -{ - //allocate auxiliary class variables (for faster search when normalizing probability at output layer) - int cl, i; - for (i=0; i -const std::map::LabelIdType, typename IDataReader::LabelType>& SequenceReader::GetLabelMapping(const std::wstring& sectionName) -{ - if (m_cachingReader) - { - return m_cachingReader->GetLabelMapping(sectionName); - } - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - return labelInfo.mapIdToLabel; -} - -// SetLabelMapping - Sets the label mapping from integer index to label -// labelMapping - mapping table from label values to IDs (must be 0-n) -// note: for tasks with labels, the mapping table must be the same between a training run and a testing run -template -void SequenceReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename LabelType>& labelMapping) -{ - if (m_cachingReader) - { - RuntimeError("Cannot set mapping table when the caching reader is being used"); - } - LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - labelInfo.mapIdToLabel = labelMapping; - labelInfo.mapLabelToId.clear(); - for (std::pair var : labelMapping) - { - labelInfo.mapLabelToId[var.second] = var.first; - } -} - -// GetData - Gets metadata from the specified section (into CPU memory) -// sectionName - section name to retrieve data from -// numRecords - number of records to read -// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request -// dataBufferSize - [in] size of the databuffer in bytes -// [out] size of buffer filled with data -// recordStart - record to start reading from, defaults to zero (start of data) -// returns: true if data remains to be read, false if the end of data was reached -template -bool SequenceReader::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) -{ - if (!m_cachingReader) - RuntimeError("GetData not supported in SequenceReader"); - return m_cachingReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart); -} - -// instantiate all the combinations we expect to be used -template class SequenceReader; -template class SequenceReader; - -template -void BatchSequenceReader::Init(const ConfigParameters& readerConfig) -{ - // See if the user wants caching - m_cachingReader = NULL; - m_cachingWriter = NULL; - - // NOTE: probably want to re-enable at some point - - // initialize the cache - //InitCache(readerConfig); - //m_readerConfig = readerConfig; - - //// if we have a cache, no need to parse the test files... 
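-    // (BatchSequenceReader::Init repeats SequenceReader::Init almost line for
-    // line; the only addition is reading nbruttsineachrecurrentiter into
-    // mBlgSize at the end.)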
- //if (m_cachingReader) - // return; - - std::vector features; - std::vector labels; - GetFileConfigNames(readerConfig, features, labels); - if (features.size() > 0) - { - m_featuresName = features[0]; - } - - if (labels.size() == 2) - { - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelsName[index] = labels[index]; - } - } - else - RuntimeError("two label definitions (in and out) required for Sequence Reader"); - - ConfigParameters featureConfig = readerConfig(m_featuresName,""); - ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; - - class_size = 0; - m_featureDim = featureConfig("dim"); - for (int index = labelInfoMin; index < labelInfoMax; ++index) - { - m_labelInfo[index].idMax = 0; - m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); - m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); - - // determine label type desired - std::string labelType(labelConfig[index]("labelType","Category")); - if (labelType == "Category") - { - m_labelInfo[index].type = labelCategory; - } - else if (labelType == "NextWord") - { - // in this case, it's all identical to the Input labels, except the data type - m_labelInfo[index].type = labelNextWord; - m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; - } - else if (labelType == "None") - { - m_labelInfo[index].type = labelNone; - m_labelInfo[index].dim = 0; // override for no labels - } - - // if we have labels, we need a label Mapping file, it will be a file with one label per line - if (m_labelInfo[index].type != labelNone) - { - std::wstring wClassFile = readerConfig("wordclass", ""); - nwords = labelConfig[index]("labelDim"); - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - } - - std::vector arrayLabels; - std::wstring labelPath = labelConfig[index]("labelMappingFile"); - if (fexists(labelPath)) - { - LoadLabelFile(labelPath, arrayLabels); - for (int i=0; i < arrayLabels.size(); ++i) - { - LabelType label = arrayLabels[i]; - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); - m_labelInfo[index].mapName = labelPath; - } - else - { - if (wClassFile != L""){ - ReadClassInfo(wClassFile , false); - int iMax = -1, i; - for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) - { - LabelType label = ptr->first; - i = ptr->second; - iMax = max(i, iMax); - m_labelInfo[index].mapIdToLabel[i] = label; - m_labelInfo[index].mapLabelToId[label] = i; - } - m_labelInfo[index].idMax = (LabelIdType)(iMax+1); - - OrganizeClass(); - - } - m_labelInfo[index].mapName = labelPath; - - m_labelInfo[index].fileToWrite = labelPath; - } - } - - m_labelInfo[index].dim = labelConfig[index]("labelDim"); - - // update dimension if the file says it's bigger - if (m_labelInfo[index].dim < m_labelInfo[index].idMax) - { - m_labelInfo[index].dim = m_labelInfo[index].idMax; - } - } - - // initialize all the variables - m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; - m_endReached = false; - m_readNextSampleLine = 0; - m_readNextSample = 0; - m_traceLevel = readerConfig("traceLevel","0"); - m_parser.SetTraceLevel(m_traceLevel); - - if (readerConfig.Exists("randomize")) - { - string randomizeString = readerConfig("randomize"); - if (randomizeString == "None") - { - ; - } - else if (randomizeString == "Auto") - { - ; - } - else - { - ;//readerConfig("randomize"); - } - } - else - { - ; 
//randomizeAuto; - } - - // The input data is a combination of the label Data and extra feature dims together -// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; - m_featureCount = 1; - - std::wstring m_file = readerConfig("file"); - if (m_traceLevel > 0) - fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); - - const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; - m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); - - mBlgSize = readerConfig("nbruttsineachrecurrentiter", "1"); -} - -template -void BatchSequenceReader::Reset() -{ - mProcessed.clear(); - mToProcess.clear(); - mLastProcssedSentenceId = 0; - mPosInSentence = 0; - mLastPosInSentence = 0; - mNumRead = 0; - - if (m_labelTemp.size() > 0) - m_labelTemp.clear(); - if (m_featureTemp.size() > 0) - m_featureTemp.clear(); - m_parser.mSentenceIndex2SentenceInfo.clear(); -} - -template -void BatchSequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) -{ - // if we aren't currently caching, see if we can use a cache - if (!m_cachingReader && !m_cachingWriter) - { - InitCache(m_readerConfig); - if (m_cachingReader) - ReleaseMemory(); // free the memory used by the SequenceReader - } - - // if we are reading from the cache, do so now and return - if (m_cachingReader) - { - m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); - return; - } - - if (m_featuresBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; - memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); - } - - if (m_labelsBuffer==NULL) - { - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - if (labelInfo.type == labelCategory) - { - m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); - m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; - memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); - } - else if (labelInfo.type != labelNone) - { - m_labelsBuffer = new ElemType[mbSize]; - memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); - m_labelsIdBuffer = NULL; - } - } - - m_featuresBufferRow = new size_t[mbSize]; - m_featuresBufferRowIdx = new size_t[mbSize]; - - m_labelsIdBufferRow = new CPUSPARSE_INDEX_TYPE[2 * mbSize]; - m_labelsBlock2Id = new size_t[2*mbSize]; - m_labelsBlock2UniqId = new size_t[2*mbSize]; - - m_id2classLocal = new Matrix(CPUDEVICE); - m_classInfoLocal = new Matrix(CPUDEVICE); - - m_mbSize = mbSize; - if (requestedEpochSamples == requestDataSize) - { - if (!m_endReached) - { - m_epochSize = requestDataSize; - } - } - else - { - m_epochSize = requestedEpochSamples; - } - - // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set - size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; - m_epoch = epoch; - m_mbStartSample = epoch*m_epochSize; - - // allocate room for the data - m_featureData.reserve(m_featureCount*epochSize); - if (m_labelInfo[labelInfoOut].type == labelCategory) - m_labelIdData.reserve(epochSize); - else if (m_labelInfo[labelInfoOut].type != labelNone) - m_labelData.reserve(epochSize); - m_sequence.reserve(m_seqIndex); // clear 
out the sequence array - /// this is too complicated for LM - // SetupEpoch(); - /// use the LMSetupEpoch() instead - LMSetupEpoch(); - - m_clsinfoRead = false; - m_idx2clsRead = false; - - m_parser.ParseReset(); - - Reset(); -} - -template -size_t BatchSequenceReader::FindNextSentences(size_t numRead) -{ - size_t sln = 0; - - if (numRead == 0) return 0; - - if (mProcessed.size() == 0) - { - mProcessed.resize(numRead, false); - } - - if (mToProcess.size() > 0) - { - bool allDone = false; - for (int s = 0; s < mToProcess.size(); s++) - { - int mp = (int)mToProcess[s]; - if (mProcessed[mp]) - { - mLastProcssedSentenceId = mp; - mLastPosInSentence = 0; - allDone = true; - break; - } - } - if (allDone) - { - mToProcess.clear(); - } - } - - if (mToProcess.size() > 0) - { - sln = m_parser.mSentenceIndex2SentenceInfo[mToProcess[0]].sLen; - return sln; - } - - for (size_t seq = mLastProcssedSentenceId ; seq < numRead; seq++) - { - if (mProcessed[seq]) continue; - - if (sln == 0) - { - sln = m_parser.mSentenceIndex2SentenceInfo[seq].sLen; - } - if (sln == m_parser.mSentenceIndex2SentenceInfo[seq].sLen && - mProcessed[seq] == false && mToProcess.size() < mBlgSize) - mToProcess.push_back(seq); - - if (mToProcess.size() == mBlgSize) break; - } - - return sln; -} - -template -bool BatchSequenceReader::EnsureDataAvailable(size_t /*mbStartSample*/) -{ - bool bDataIsThere = true; - - m_featureData.clear(); - m_labelIdData.clear(); - - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - - bool nextWord = false; - if (m_labelInfo[labelInfoOut].type == labelNextWord) - { - nextWord = true; - } - LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; - - // see how many we already read - std::vector seqPos; - - size_t sLn = FindNextSentences(mNumRead); - if (sLn == 0) - { - Reset(); - - mNumRead = m_parser.Parse(CACHE_BLOG_SIZE, &m_labelTemp, &m_featureTemp, &seqPos); - if (mNumRead == 0) return false; - - std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end()); - - m_readNextSampleLine += mNumRead; - sLn = FindNextSentences(mNumRead); - } - - /// add one minibatch - size_t i = mLastPosInSentence; - size_t j = 0; - // exclude the last token since it is the last label to be predicted - for (i = mLastPosInSentence; j < m_mbSize && i < sLn-1; i++ , j++) - { - for (int k = 0; k < mToProcess.size(); k++) - { - size_t seq = mToProcess[k]; - size_t label = m_parser.mSentenceIndex2SentenceInfo[seq].sBegin + i; - - // labelIn should be a category label - LabelType labelValue = m_labelTemp[label++]; - - // to-do, should ignore , check the sentence ending is - // need to remove from the training set - // allocate and initialize the next chunck of featureData - if (labelIn.type == labelCategory) - { - LabelIdType index = GetIdFromLabel(labelValue, labelIn); - - // use the found value, and set the appropriate location to a 1.0 - assert(labelIn.dim > index); // if this goes off labelOut dimension is too small - m_featureData.push_back((float)index); - } - else - { - RuntimeError("Input label expected to be a category label"); - } - - // now get the output label - if (m_labelInfo[labelInfoOut].type == labelCategory) - { - labelValue = m_labelTemp[label++]; - } - else if (nextWord) - { - // this is the next word (label was incremented above) - labelValue = m_labelTemp[label]; - if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) - { - labelValue = labelInfo.endSequence; - } - } - else - { - 
RuntimeError("Invalid output label type, expected Category, or Next Word"); - } - - // get the ID from the label - LabelIdType id = GetIdFromLabel(labelValue, labelInfo); - m_labelIdData.push_back(id); - - m_totalSamples ++; - } - } - - mLastPosInSentence = i; - - return bDataIsThere; -} - -template -size_t BatchSequenceReader::NumberSlicesInEachRecurrentIter() -{ - size_t sz = mToProcess.size(); - return sz; -} - -template -void BatchSequenceReader::SetNbrSlicesEachRecurrentIter(const size_t mz) -{ - mBlgSize = mz; -} - -template -bool BatchSequenceReader::GetMinibatch(std::map*>& matrices) -{ - - // get out if they didn't call StartMinibatchLoop() first - if (m_mbSize == 0) - return false; - - bool moreData = EnsureDataAvailable(m_mbStartSample); - if (moreData == false) - return false; - - // actual size is the size of the next seqence - size_t actualmbsize = 0; - - // figure out the size of the next sequence - actualmbsize = m_labelIdData.size() ; - if (actualmbsize > m_mbSize * mToProcess.size()){ - RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. memory can crash!", m_mbSize, actualmbsize); - } - - // now get the labels - const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; - - if (actualmbsize > 0) - { - - //loop through all the samples - Matrix& features = *matrices[m_featuresName]; - - // copy m_featureData to matrix - // we always copy it to cpu first and then convert to gpu if gpu is desired. - DEVICEID_TYPE featureDeviceId = features.GetDeviceId(); - features.TransferFromDeviceToDevice(featureDeviceId, CPUDEVICE, false, true, false); - - if (features.GetMatrixType() == MatrixType::DENSE) - { - features.Resize(labelInfo.dim, actualmbsize); - features.SetValue(0); - } - else - { - features.Resize(labelInfo.dim, actualmbsize, actualmbsize); - features.Reset(); - } - - for (size_t j = 0; j < actualmbsize; ++j) - { - // vector of feature data goes into matrix column - size_t idx = (size_t)m_featureData[j]; - - //if (matrices.find(m_featuresName) != matrices.end()) - features.SetValue(idx, j, (ElemType)1); - } - - features.TransferFromDeviceToDevice(CPUDEVICE, featureDeviceId, false,false, false); - - //else // for GPU - //{ - // if (matrices.find(m_featuresName) != matrices.end()) - // { - // m_indexer.clear(); - // size_t size = m_featureData.size(); - - // for(int i = 0; i < size; i++) - // { - // m_featuresBufferRow[i] = (size_t)m_featureData[i]; - // if(m_indexer.find(m_featuresBufferRow[i]) == m_indexer.end()) - // { - // m_indexer[m_featuresBufferRow[i]] = m_indexer.size(); - // } - // m_featuresBufferRowIdx[i] = m_indexer[m_featuresBufferRow[i]]; - // } - // features.SetMatrixFromCSCFormat(m_featuresBufferRow, m_featuresBufferRowIdx, size, m_indexer.size()); - // } - //} - - // TODO: move these two methods to startMiniBatchLoop() - GetInputToClass(matrices); - GetClassInfo(matrices); - GetLabelOutput(matrices, 0, actualmbsize); - - // go to the next sequence - m_seqIndex++; - } - else - return false; - - // now transfer to the GPU as needed - try{ - // get the features array - if (matrices.find(m_featuresName) == matrices.end()) - { - Matrix& nbs = *matrices[L"numberobs"]; - int curDevId = nbs.GetDeviceId(); - nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); - nbs(0,0) = (float)actualmbsize; - nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); - for (size_t i = 0; i < actualmbsize; i++) - { - std::wstring ws = 
msra::strfun::wstrprintf (L"feature%d", i); - Matrix& features = *matrices[ws]; - features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); - } - } - }catch(...) - { - RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); - } - - // we read some records, so process them - return true; -} - -template -void BatchSequenceReader::SetSentenceEnd(int wrd, int pos, int actualMbSize) -{ - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - LabelIdType index = GetIdFromLabel(labelIn.endSequence.c_str(), labelIn); - - if (pos == actualMbSize - 1) - { - if (wrd == (int)index) - mSentenceEnd = true; - else - mSentenceEnd = false; - } -} - -template -void BatchSequenceReader::SetSentenceBegin(int wrd, int pos, int /*actualMbSize*/) -{ - // now get the labels - LabelInfo& labelIn = m_labelInfo[labelInfoIn]; - LabelIdType index = GetIdFromLabel(labelIn.beginSequence.c_str(), labelIn); - - if (pos == 0) - { - if (wrd == (int)index) - mSentenceBegin = true; - else - mSentenceBegin = false; - } -} - -template -void BatchSequenceReader::SetSentenceEndInBatch(vector &sentenceEnd) -{ - sentenceEnd.resize(mToProcess.size()); - if (mSentenceBegin) - { - sentenceEnd.assign(mToProcess.size(), 0); - } - else - { - sentenceEnd.assign(mToProcess.size(), m_mbSize+2); - } -} - -template -bool BatchSequenceReader::DataEnd(EndDataType endDataType) -{ - bool ret = false; - switch (endDataType) - { - case endDataNull: - assert(false); - break; - case endDataEpoch: - case endDataSet: - ret = !EnsureDataAvailable(m_mbStartSample); - break; - case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true - if (mSentenceEnd) - { - for (auto ptr = mToProcess.begin(); ptr != mToProcess.end(); ptr++) - mProcessed[*ptr] = true; - } - ret = mSentenceEnd; - break; - } - return ret; - -} - -template -void BatchSequenceReader::GetLabelOutput(std::map*>& matrices, - size_t m_mbStartSample, size_t actualmbsize) -{ - size_t j = 0; - Matrix* labels = matrices[m_labelsName[labelInfoOut]]; - if (labels == nullptr) return; - - if(labels->GetMatrixType() == MatrixType::DENSE) - { - labels->Resize(nwords + class_size, actualmbsize, false); - labels->SetValue(0); - } - else - { - labels->Resize(nwords + class_size, actualmbsize, 2*actualmbsize); - labels->Reset(); - } - - if(labels->GetCurrentMatrixLocation() == CPU) { - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - - int wrd = m_labelIdData[jRand]; - int clsidx = idx4class[wrd]; - - labels->SetValue(wrd, j, 1); - - SetSentenceEnd(wrd, j, actualmbsize); - SetSentenceBegin(wrd, j, actualmbsize); - - if (class_size > 0) - labels->SetValue(nwords + clsidx, j, 1); - } - } - else // GPU - { - m_indexer.clear(); - int p = 0; - int b = 0; - int nz = 0; - - for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) - { - // pick the right sample with randomization if desired - size_t jRand = jSample; - int wrd = m_labelIdData[jRand]; - int clsidx = idx4class[wrd]; - SetSentenceEnd(wrd, j, actualmbsize); - SetSentenceBegin(wrd, j, actualmbsize); - - int start[2]; - int end[2]; - int target[2]; - int blockId[2]; - - start[0] = (int)(*m_classInfoLocal)(0, clsidx); - end[0] = (int)(*m_classInfoLocal)(1, clsidx); - target[0] = wrd; - blockId[0] = clsidx; - start[1] = nwords; - end[1] = nwords 
+ (int)(*m_classInfoLocal).GetNumCols(); - target[1] = nwords + clsidx; - blockId[1] = -1; - - for(int i = 0; i < 2; i++) - { - m_labelsIdBufferRow[p] = target[i]; - int len = end[i] - start[i]; - - if(m_indexer.find(blockId[i]) == m_indexer.end()) - { - m_indexer[blockId[i]] = b; - b += len; - } - m_labelsBlock2Id[p] = nz; - m_labelsBlock2UniqId[p] = m_indexer[blockId[i]]; - nz += len; - p++; - } - } - - labels->SetMatrixFromLabelAndClass(m_labelsIdBufferRow, m_labelsBlock2Id, m_labelsBlock2UniqId, 2*actualmbsize, nz, b); - } -} - -template class BatchSequenceReader; -template class BatchSequenceReader; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// SequenceReader.cpp : Defines the exported functions for the DLL application. +// + + +#include "stdafx.h" +#define DATAREADER_EXPORTS // creating the exports here +#include "DataReader.h" +#include "SequenceReader.h" +#ifdef LEAKDETECT +#include // leak detection +#endif +#include "fileutil.h" // for fexists() + +namespace Microsoft { namespace MSR { namespace CNTK { + +// ReadLine - Read a line +// readSample - sample to read in global sample space +// returns - true if we successfully read a record, otherwise false +template +bool SequenceReader::ReadRecord(size_t /*readSample*/) +{ + return false; // not used +} + +// RecordsToRead - Determine number of records to read to populate record buffers +// mbStartSample - the starting sample from which to read +// tail - we are checking for possible remainer records to read (default false) +// returns - true if we have more to read, false if we hit the end of the dataset +template +size_t SequenceReader::RecordsToRead(size_t mbStartSample, bool tail) +{ + assert(mbStartSample >= m_epochStartSample); + // determine how far ahead we need to read + // need to read to the end of the next minibatch + size_t epochSample = mbStartSample; + epochSample %= m_epochSize; + + // determine number left to read for this epoch + size_t numberToEpoch = m_epochSize - epochSample; + // we will take either a minibatch or the number left in the epoch + size_t numberToRead = min(numberToEpoch, m_mbSize); + if (numberToRead == 0 && !tail) + numberToRead = m_mbSize; + + return numberToRead; +} + +// GetIdFromLabel - get an Id from a Label +// mbStartSample - the starting sample we are ensureing are good +// endOfDataCheck - check if we are at the end of the dataset (no wraparound) +// returns - true if we have more to read, false if we hit the end of the dataset +template +/*IDataReader::LabelIdType*/ unsigned SequenceReader::GetIdFromLabel(const std::string& labelValue, LabelInfo& labelInfo) +{ + auto found = labelInfo.mapLabelToId.find(labelValue); + + // not yet found, add to the map + if (found == labelInfo.mapLabelToId.end()) + { + RuntimeError("%s not in vocabulary", labelValue.c_str()); + } + return found->second; +} + +template +/*IDataReader::LabelIdType*/ bool SequenceReader::CheckIdFromLabel(const std::string& labelValue, const LabelInfo& labelInfo, unsigned & labelId) +{ + auto found = labelInfo.mapLabelToId.find(labelValue); + + // not yet found, add to the map + if (found == labelInfo.mapLabelToId.end()) + { + return false; + } + labelId = found->second; + return true; +} + +// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested +// mbStartSample - the starting sample we are starting with +// endOfDataCheck - check if we are at the end of the dataset (no wraparound) +// returns - true if we have more to read, false if we hit the 
end of the dataset +template +bool SequenceReader::EnsureDataAvailable(size_t mbStartSample, bool /*endOfDataCheck*/) +{ + assert(mbStartSample >= m_epochStartSample); + // determine how far ahead we need to read + // need to read to the end of the next minibatch + size_t epochSample = mbStartSample; + bool moreToRead = true; + + size_t numberToRead = RecordsToRead(mbStartSample); + + // check to see if we have the proper records read already + if (m_readNextSample >= mbStartSample+numberToRead && mbStartSample >= m_epochStartSample) + return true; + + // if we have another sequence already read and waiting, just return now + if (m_seqIndex < m_sequence.size()) + return true; + + m_seqIndex = 0; + m_mbStartSample = 0; + m_sequence.clear(); + m_featureData.clear(); + m_labelIdData.clear(); + + m_readNextSample = 0; + epochSample = 0; + + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + + bool nextWord = false; + if (m_labelInfo[labelInfoOut].type == labelNextWord) + { + nextWord = true; + } + LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; + + //if (m_labelIdData.size() > epochSample) + //{ + // m_labelIdData.resize(epochSample); + // m_labelData.resize(epochSample*labelInfo.dim); + //} + + // see how many we already read + int sequencesRead = 0; + std::vector featureTemp; + std::vector labelTemp; + std::vector seqPos; + do + { + int numRead = m_parser.Parse(CACHE_BLOG_SIZE, &labelTemp, &featureTemp, &seqPos); + moreToRead = (numRead != 0); + + // translate from the sparse parsed data format to the to the training format data + int label = 0; + bool bSentenceStart = false; + SequencePosition sposLast = SequencePosition(0,0,seqFlagNull); + for (int seq = 0; seq < numRead; seq++) + { + // check + SequencePosition spos = seqPos[seq]; + if (spos.labelPos == sposLast.labelPos && spos.numberPos == sposLast.numberPos) + continue; + sposLast = spos; + + bSentenceStart = true; + + // loop through the labels for this entry + while (label < spos.labelPos) /// need to minus one since + { + + // labelIn should be a category label + LabelType labelValue = labelTemp[label++]; + + if (trim(labelValue).size() == 0) + continue; // empty input + + // check for end of sequence marker + if (!bSentenceStart && (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str()) || ((label - 1 )% m_mbSize == 0) )) + { + // ignore those cases where $ is put in the begining, because those are used for initialization purpose + spos.flags |= seqFlagStopLabel; + sequencesRead++; + + // create the seqence table + m_sequence.push_back(epochSample); + if ((m_sequence.size() == 1 ? epochSample : epochSample - m_sequence[m_sequence.size()-2]) > m_mbSize) + { + fprintf(stderr, "read sentence length is longer than the minibatch size. should be smaller. increase the minibatch size to at least %d", epochSample); + RuntimeError("read sentence length is longer than the minibatch size. should be smaller. 
increase the minibatch size to at least %d", epochSample); + } + + if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) + continue; /// ignore sentence ending + } + + // to-do, should ignore , check the sentence ending is + // need to remove from the training set + // allocate and initialize the next chunck of featureData + if (labelIn.type == labelCategory) + { + LabelIdType index = GetIdFromLabel(labelValue, labelIn); + + // use the found value, and set the appropriate location to a 1.0 + assert(labelIn.dim > index); // if this goes off labelOut dimension is too small + m_featureData.push_back((float)index); + } + else + { + RuntimeError("Input label expected to be a category label"); + } + + // if we have potential features + if (m_featureDim > 0) + { + RuntimeError("to-do. Assume sparse input feature. need to change the code from dense matrix"); + // move the position up to the start of the additional features section +/* pos += labelIn.dim; + assert(pos + m_featureDim == m_featureData.size()); + // this has to be an even number, a pair of index and value + if ((spos.numberPos&1) != 0) + RuntimeError("Features must be specified in pairs (index:value). Invalid features for label '%s'\n", labelValue); + + while (feature < spos.numberPos) + { + int index = (int)featureTemp[feature++]; + if (index < 0 || index >= m_featureDim) + RuntimeError("Invalid feature index: %d for label '%s', feature max dimension = %lld\n", index, labelValue, m_featureDim); + + ElemType value = featureTemp[feature++]; + m_featureData[pos+index] = value; + } + */ + } + + // now get the output label + if (m_labelInfo[labelInfoOut].type == labelCategory) + { + labelValue = labelTemp[label++]; + } + else if (nextWord) + { + // this is the next word (label was incremented above) + labelValue = labelTemp[label]; + if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) + { + labelValue = labelInfo.endSequence; + } + } + else + { + RuntimeError("Invalid output label type, expected Category, or Next Word"); + } + + // get the ID from the label + LabelIdType id = GetIdFromLabel(labelValue, labelInfo); + m_labelIdData.push_back(id); + + m_readNextSample++; + epochSample++; + if (!m_endReached) + m_totalSamples++; // add to the total number of records in the dataset + + bSentenceStart = false; + } + + { + // check if the reading is right + int jEnd = (int) m_labelIdData.size() - 1; + LabelIdType index ; + if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) + RuntimeError("cannot find sentence begining label"); + + if (m_labelIdData[jEnd] != index ) + /// for language model, the first word/letter has to be + RuntimeError("SequenceReader: the last letter/word of a batch has to be the sentence ending symbol"); + } + + } + + m_readNextSampleLine += numRead; + } + while (sequencesRead < 1 && moreToRead); // we need to read at least one sequence or have no more data + + // if we read to the end, update appropriate variables + if (!moreToRead) + { + UpdateDataVariables(); + } + + // if there more to read + return moreToRead; +} + +// UpdateDataVariables - Update variables that depend on the dataset being completely read +template +void SequenceReader::UpdateDataVariables() +{ + // if we haven't been all the way through the file yet + if (!m_endReached) + { + // get the size of the dataset + assert(m_totalSamples*m_featureCount >= m_featureData.size()); + + // if they want us to determine epoch size based on dataset size, do that + if (m_epochSize == 
requestDataSize)
+    {
+        m_epochSize = m_totalSamples;
+    }
+
+    WriteLabelFile();
+
+    // we got to the end of the dataset
+    m_endReached = true;
+    }
+
+    // update the label dimension if it is not big enough; needed here because m_labelIdMax gets updated in the processing loop (after a read)
+    for (int index = labelInfoMin; index < labelInfoMax; ++index)
+    {
+        if (m_labelInfo[index].type == labelCategory && m_labelInfo[index].idMax > m_labelInfo[index].dim)
+            m_labelInfo[index].dim = m_labelInfo[index].idMax;   // update the label dimensions if different
+    }
+}
+
+template<class ElemType>
+void SequenceReader<ElemType>::WriteLabelFile()
+{
+    // write out the label files, if requested; this can only be done once the full label mapping is known
+    for (int index = labelInfoMin; index < labelInfoMax; ++index)
+    {
+        LabelInfo& labelInfo = m_labelInfo[index];
+
+        // write out the label file if they don't have one
+        if (!labelInfo.fileToWrite.empty())
+        {
+            if (labelInfo.mapIdToLabel.size() > 0)
+            {
+                File labelFile(labelInfo.fileToWrite, fileOptionsWrite | fileOptionsText);
+                for (int i=0; i < labelInfo.mapIdToLabel.size(); ++i)
+                {
+                    labelFile << labelInfo.mapIdToLabel[i] << '\n';
+                }
+                labelInfo.fileToWrite.clear();
+            }
+            else if (!m_cachingWriter)
+            {
+                fprintf(stderr, "WARNING: file %ws NOT written to disk, label files are only written when starting at epoch zero!", labelInfo.fileToWrite.c_str());
+            }
+        }
+    }
+}
+
+template<class ElemType>
+void SequenceReader<ElemType>::LoadLabelFile(const std::wstring& filePath, std::vector<LabelType>& retLabels)
+{
+    File file(filePath, fileOptionsRead);
+
+    // initialize with file name
+    std::string path = msra::strfun::utf8(filePath);
+    auto location = path.find_last_of("/\\");
+    if (location != std::string::npos)
+        path = path.substr(location+1);
+
+    // read the file line by line; each non-blank line is one label
+    string str;
+    retLabels.resize(0);
+    while (!file.IsEOF())
+    {
+        file.GetLine(str);
+
+        // skip blank lines
+        string::size_type pos = str.find_first_not_of(" \t");
+        if (pos != string::npos)
+        {
+            retLabels.push_back((LabelType)trim(str));
+        }
+    }
+}
+
+
+// Destroy - cleanup and remove this class
+// NOTE: this destroys the object, and it can't be used past this point
+template<class ElemType>
+void SequenceReader<ElemType>::Destroy()
+{
+    delete this;
+}
+
+// Init - Reader Initialize for multiple data sets
+// config - [in] configuration parameters for the datareader
+// Sample format below:
+//# Parameter values for the reader
+//reader=[
+//  # reader to use
+//  readerType=SequenceReader
+//  randomize=None
+//  # additional features dimension
+//  featureDim=784
+//  file=c:\data\sequence\sequence.txt
+//  labelIn=[
+//    dim=26
+//    labelMappingFile=c:\data\sequence\alphabet.txt
+//    labelType=Category
+//    beginSequence=""
+//    endSequence=""
+//  ]
+//  labelOut=[
+//    dim=129
+//    labelMappingFile=c:\data\sequence\phonemes.txt
+//    labelType=Category
+//    beginSequence="O"
+//    endSequence="O"
+//  ]
+//]
+template<class ElemType>
+void SequenceReader<ElemType>::Init(const ConfigParameters& readerConfig)
+{
+    // See if the user wants caching
+    m_cachingReader = NULL;
+    m_cachingWriter = NULL;
+
+    // NOTE: probably want to re-enable at some point
+
+    // initialize the cache
+    //InitCache(readerConfig);
+    //m_readerConfig = readerConfig;
+
+    //// if we have a cache, no need to parse the test files...
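
LoadLabelFile above reads one label per line, and Init below turns that array into the
two lookup tables (mapIdToLabel and mapLabelToId) so ids and label strings can be
translated in both directions. A self-contained sketch of that round trip, assuming a
plain text mapping file; loadLabelMap and the file name are illustrative stand-ins, not
CNTK APIs:

#include <fstream>
#include <iostream>
#include <map>
#include <string>

// Load a one-label-per-line mapping file; a label's id is its line number,
// mirroring LoadLabelFile plus the map-building loop in Init.
static void loadLabelMap(const std::string& path,
                         std::map<unsigned, std::string>& idToLabel,
                         std::map<std::string, unsigned>& labelToId)
{
    std::ifstream file(path);
    std::string line;
    unsigned id = 0;
    while (std::getline(file, line))
    {
        if (line.find_first_not_of(" \t") == std::string::npos)
            continue;  // skip blank lines, as LoadLabelFile does
        idToLabel[id] = line;
        labelToId[line] = id;
        ++id;
    }
}

int main()
{
    std::map<unsigned, std::string> idToLabel;
    std::map<std::string, unsigned> labelToId;
    loadLabelMap("alphabet.txt", idToLabel, labelToId);  // hypothetical mapping file
    std::cout << idToLabel.size() << " labels loaded\n";
    return 0;
}
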
+ //if (m_cachingReader) + // return; + + std::vector features; + std::vector labels; + GetFileConfigNames(readerConfig, features, labels); + if (features.size() > 0) + { + m_featuresName = features[0]; + } + + if (labels.size() == 2) + { + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelsName[index] = labels[index]; + } + } + else + RuntimeError("two label definitions (in and out) required for Sequence Reader"); + + ConfigParameters featureConfig = readerConfig(m_featuresName,""); + ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; + + class_size = 0; + m_featureDim = featureConfig("dim"); + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelInfo[index].idMax = 0; + m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); + m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); + + // determine label type desired + std::string labelType(labelConfig[index]("labelType","Category")); + if (labelType == "Category") + { + m_labelInfo[index].type = labelCategory; + } + else if (labelType == "NextWord") + { + // in this case, it's all identical to the Input labels, except the data type + m_labelInfo[index].type = labelNextWord; + m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; + } + else if (labelType == "None") + { + m_labelInfo[index].type = labelNone; + m_labelInfo[index].dim = 0; // override for no labels + } + + // if we have labels, we need a label Mapping file, it will be a file with one label per line + if (m_labelInfo[index].type != labelNone) + { + std::wstring wClassFile = readerConfig("wordclass", ""); + nwords = labelConfig[index]("labelDim"); + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + } + + std::vector arrayLabels; + std::wstring labelPath = labelConfig[index]("labelMappingFile"); + if (fexists(labelPath)) + { + LoadLabelFile(labelPath, arrayLabels); + for (int i=0; i < arrayLabels.size(); ++i) + { + LabelType label = arrayLabels[i]; + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); + m_labelInfo[index].mapName = labelPath; + } + else + { + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + int iMax = -1, i; + for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) + { + LabelType label = ptr->first; + i = ptr->second; + iMax = max(i, iMax); + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)(iMax+1); + + OrganizeClass(); + + } + m_labelInfo[index].mapName = labelPath; + + m_labelInfo[index].fileToWrite = labelPath; + } + } + + m_labelInfo[index].dim = labelConfig[index]("labelDim"); + + // update dimension if the file says it's bigger + if (m_labelInfo[index].dim < m_labelInfo[index].idMax) + { + m_labelInfo[index].dim = m_labelInfo[index].idMax; + } + } + + // initialize all the variables + m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; + m_endReached = false; + m_readNextSampleLine = 0; + m_readNextSample = 0; + m_traceLevel = readerConfig("traceLevel","0"); + m_parser.SetTraceLevel(m_traceLevel); + + if (readerConfig.Exists("randomize")) + { + string randomizeString = readerConfig("randomize"); + if (randomizeString == "None") + { + ; + } + else if (randomizeString == "Auto") + { + ; + } + else + { + ;//readerConfig("randomize"); + } + } + else + { + ; 
//randomizeAuto; + } + + // The input data is a combination of the label Data and extra feature dims together +// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; + m_featureCount = 1; + + std::wstring m_file = readerConfig("file"); + if (m_traceLevel > 0) + fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); + + const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; + m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); +} + +template +void SequenceReader::ReadWord(char *word, FILE *fin) +{ + int a=0, ch; + + while (!feof(fin)) { + ch=fgetc(fin); + + if (ch==13) continue; + + if ((ch==' ') || (ch=='\t') || (ch=='\n')) { + if (a>0) { + if (ch=='\n') ungetc(ch, fin); + break; + } + + if (ch=='\n') { + strcpy_s(word, strlen(""), (char *)""); + return; + } + else continue; + } + + word[a]=(char)ch; + a++; + + if (a>=MAX_STRING) { + //printf("Too long word found!\n"); //truncate too long words + a--; + } + } + word[a]=0; +} + +template +void SequenceReader::ReadClassInfo(const wstring & vocfile, bool /*flatten*/) +{ + char strFileName[MAX_STRING]; + char stmp[MAX_STRING]; + string strtmp; + size_t sz; + int cnt, clsidx, b; + class_size = 0; + + wcstombs_s(&sz, strFileName, 2048, vocfile.c_str(), vocfile.length()); + + FILE * vin; + vin = fopen(strFileName, "rt") ; + + if (vin == nullptr) + { + RuntimeError("cannot open word class file"); + } + for (int a = 0; a < nwords; a++) + { + fscanf_s(vin, "%6d\t%10d\t", &b, &cnt); + ReadWord(stmp, vin); + fscanf_s(vin, "%d\t\n", &clsidx); + strtmp = stmp; + idx4cnt[b] = cnt; + word4idx[strtmp] = b; + idx4word[b]= strtmp; + + idx4class[b] = clsidx; + class_size = max(class_size, clsidx); + } + fclose(vin); + + class_size ++; +} + +// InitCache - Initialize the caching reader if cache files exist, otherwise the writer +// readerConfig - reader configuration +template +void SequenceReader::InitCache(const ConfigParameters& readerConfig) +{ + // check for a writer tag first (lets us know we are caching) + if (!readerConfig.Exists("writerType")) + return; + + // first try to open the binary cache + bool found = false; + try + { + // TODO: need to go down to all levels, maybe search for sectionType + ConfigArray filesList(','); + vector names; + if (readerConfig.Exists("wfile")) + { + filesList.push_back(readerConfig("wfile")); + if (fexists(readerConfig("wfile"))) + found = true; + } + FindConfigNames(readerConfig, "wfile", names); + for (auto name : names) + { + ConfigParameters config = readerConfig(name); + filesList.push_back(config("wfile")); + if (fexists(config("wfile"))) + found = true; + } + + // if we have a file already, we are going to read the cached files + if (found) + { + ConfigParameters config; + readerConfig.CopyTo(config); + // mmodify the config so the reader types look correct + config["readerType"] = config("writerType"); + config["file"] = filesList; + m_cachingReader = new DataReader(config); + } + else + { + m_cachingWriter = new DataWriter(readerConfig); + + // now get the section names for map and category types + std::map sections; + m_cachingWriter->GetSections(sections); + for (auto pair : sections) + { + // TODO: we would need to add a sequenceMap type here as well + // or maybe change to heirarchal name (i.e. 
root.labelIn.map)
+                if (pair.second == sectionTypeCategoryLabel)
+                {
+                    m_labelsCategoryName[labelInfoOut] = pair.first;
+                }
+                else if (pair.second == sectionTypeLabelMapping)
+                {
+                    m_labelsMapName[labelInfoOut] = pair.first;
+                }
+            }
+        }
+    }
+    catch (runtime_error& err)
+    {
+        fprintf(stderr, "Error attempting to create Binary%s\n%s\n", found?"Reader":"Writer", err.what());
+        delete m_cachingReader;
+        m_cachingReader = NULL;
+        delete m_cachingWriter;
+        m_cachingWriter = NULL;
+    }
+    catch (...)
+    {
+        // if there is any error, just get rid of the object
+        fprintf(stderr, "Error attempting to create Binary%s\n", found?"Reader":"Writer");
+        delete m_cachingReader;
+        m_cachingReader = NULL;
+        delete m_cachingWriter;
+        m_cachingWriter = NULL;
+    }
+}
+
+// destructor - virtual so it gets called properly
+template<class ElemType>
+SequenceReader<ElemType>::~SequenceReader()
+{
+    ReleaseMemory();
+    delete m_cachingReader;
+    delete m_cachingWriter;
+}
+
+// ReleaseMemory - release the memory footprint of SequenceReader
+// used when the caching reader is taking over
+template<class ElemType>
+void SequenceReader<ElemType>::ReleaseMemory()
+{
+    if (m_featuresBuffer!=NULL)
+        delete[] m_featuresBuffer;
+    m_featuresBuffer=NULL;
+    if (m_labelsBuffer!=NULL)
+        delete[] m_labelsBuffer;
+    m_labelsBuffer=NULL;
+    if (m_labelsIdBuffer!=NULL)
+        delete[] m_labelsIdBuffer;
+    m_labelsIdBuffer=NULL;
+    m_featureData.clear();
+    m_labelIdData.clear();
+    m_labelData.clear();
+    m_sequence.clear();
+}
+
+//SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch
+template<class ElemType>
+void SequenceReader<ElemType>::SetupEpoch()
+{
+    // if we are starting fresh (epoch zero and no data read), init everything
+    // however if we are using cachingWriter, we need to know the record count, so do that first
+    if (m_epoch == 0 && m_totalSamples == 0 && m_cachingWriter == NULL)
+    {
+        m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0;
+        m_parser.SetFilePosition(0);
+    }
+    else  // otherwise, position the read to start at the right location
+    {
+        m_seqIndex = 0;
+        // don't know the total number of samples yet, so count them
+        if (m_totalSamples == 0)
+        {
+            if (m_traceLevel > 0)
+                fprintf(stderr, "starting at epoch %d, parsing all data to determine record count\n", (int)m_epoch);
+            // choose a large number to read
+            m_parser.SetFilePosition(0);
+            m_mbStartSample = 0;
+            while (EnsureDataAvailable(m_mbStartSample))
+            {
+                m_mbStartSample = m_totalSamples;
+                m_seqIndex = m_sequence.size();
+            }
+            if (m_traceLevel > 0)
+                fprintf(stderr, "\n %lld records found\n", (long long)m_totalSamples);
+        }
+        m_seqIndex = 0;
+
+        // we have a slight dilemma here: if we haven't determined the end of the file yet
+        // and the user told us to find how many records are in the file, we can't distinguish "almost done"
+        // with a file (a character away) from the middle of the file. So read ahead a record to see if it's there.
+ bool endReached = m_endReached; + if (!endReached) + { + if (!m_parser.HasMoreData()) + { + endReached = true; + UpdateDataVariables(); + assert(m_endReached); + } + } + + // always start from the first sample + m_epochStartSample = m_mbStartSample = 0; + } +} + +template +void SequenceReader::LMSetupEpoch() +{ + m_readNextSampleLine = m_readNextSample = m_epochStartSample = m_mbStartSample = m_seqIndex = 0; +} + +// utility function to round an integer up to a multiple of size +size_t RoundUp(size_t value, size_t size) +{ + return ((value + size -1)/size)*size; +} + +//StartMinibatchLoop - Startup a minibatch loop +// mbSize - [in] size of the minibatch (number of Samples, etc.) +// NOTE: for sequence data, this will be the MAX size of a sequence, as every sequence could be a different length +// epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run) +// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset +template +void SequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) +{ + // if we aren't currently caching, see if we can use a cache + if (!m_cachingReader && !m_cachingWriter) + { + InitCache(m_readerConfig); + if (m_cachingReader) + ReleaseMemory(); // free the memory used by the SequenceReader + } + + // if we are reading from the cache, do so now and return + if (m_cachingReader) + { + m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); + return; + } + + if (m_featuresBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; + memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); + } + + if (m_labelsBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + if (labelInfo.type == labelCategory) + { + m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); + m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; + memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); + } + else if (labelInfo.type != labelNone) + { + m_labelsBuffer = new ElemType[mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); + m_labelsIdBuffer = NULL; + } + } + + m_mbSize = mbSize; + if (requestedEpochSamples == requestDataSize) + { + if (!m_endReached) + { + m_epochSize = requestDataSize; + } + } + else + { + m_epochSize = requestedEpochSamples; + } + + // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set + size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; + m_epoch = epoch; + m_mbStartSample = epoch*m_epochSize; + + // allocate room for the data + m_featureData.reserve(m_featureCount*epochSize); + if (m_labelInfo[labelInfoOut].type == labelCategory) + m_labelIdData.reserve(epochSize); + else if (m_labelInfo[labelInfoOut].type != labelNone) + m_labelData.reserve(epochSize); + m_sequence.reserve(m_seqIndex); // clear out the sequence array + /// this is too complicated for LM + // SetupEpoch(); + /// use the LMSetupEpoch() instead + LMSetupEpoch(); + + m_clsinfoRead = false; + m_idx2clsRead = false; + + m_parser.ParseReset(); +} + +template +bool SequenceReader::DataEnd(EndDataType endDataType) 
+{ + bool ret = false; + switch (endDataType) + { + case endDataNull: + assert(false); + break; + case endDataEpoch: + ret = m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]; + break; + case endDataSet: + ret = !EnsureDataAvailable(m_mbStartSample); + break; + case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true + ret = SentenceEnd(); + break; + } + return ret; +} + + +template +bool SequenceReader::SentenceEnd() +{ + // this is after getMinibatch size, which has increased m_seqIndex by 1 + // so the real index is m_seqIndex - 1; + int seqIndex = (int)m_seqIndex - 1; + + // now get the labels + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + size_t actualmbsize = 0; + + // figure out the size of the next sequence + if (seqIndex > 0) + { + actualmbsize = m_sequence[seqIndex] - m_sequence[seqIndex-1]; + } + else + { + actualmbsize = m_sequence[0]; + } + + if (actualmbsize < m_mbSize) + return true; + + size_t jEnd = m_sequence[seqIndex]-1; + + if (labelInfo.type == labelCategory) + { + LabelIdType index ; + if (CheckIdFromLabel(labelInfo.endSequence, labelInfo, index) == false) + RuntimeError("cannot find sentence begining label"); + + if (m_labelIdData[jEnd] == index ) + return true; + else + return false; + } + return false; +} + +template +void SequenceReader::GetLabelOutput(std::map*>& matrices, + size_t m_mbStartSample, size_t actualmbsize) +{ + size_t j = 0; + Matrix* labels = matrices[m_labelsName[labelInfoOut]]; + if (labels == nullptr) return; + + labels->Resize(nwords + class_size, actualmbsize, false); + + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + + int wrd = m_labelIdData[jRand]; + int clsidx = idx4class[wrd]; + + labels->SetValue(wrd, j, 1); + + if (class_size > 0) + labels->SetValue(nwords + clsidx, j, 1); + } + +} + +template +void SequenceReader::GetInputToClass(std::map*>& matrices) +{ + Matrix* idx2cls= matrices[STRIDX2CLS]; + if (idx2cls== nullptr) return; + + if (m_idx2clsRead) return; + + // populate local CPU matrix + m_id2classLocal->SwitchToMatrixType(MatrixType::DENSE); + m_id2classLocal->Resize(nwords , 1, false); + + //move to CPU since element-wise operation is expensive and can go wrong in GPU + int curDevId = m_id2classLocal->GetDeviceId(); + m_id2classLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + for (size_t j = 0; j < nwords ; j++) + { + int clsidx = idx4class[(int)j]; + (*m_id2classLocal)(j,0) = (float)clsidx; + } + m_id2classLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + + int oldDeviceId = idx2cls->GetDeviceId(); + // caution, SetValue changes idx2cls from GPU to CPU, may change this behavior later + idx2cls->SetValue(*m_id2classLocal); + idx2cls->TransferFromDeviceToDevice(idx2cls->GetDeviceId(), oldDeviceId, true); + + m_idx2clsRead = true; +} + +template +void SequenceReader::GetClassInfo(std::map*>& matrices) +{ + Matrix* clsinfo = matrices[CLASSINFO]; + if (clsinfo == nullptr) return; + + if (m_clsinfoRead) return; + + // populate local CPU matrix + m_classInfoLocal->SwitchToMatrixType(MatrixType::DENSE); + m_classInfoLocal->Resize(2, class_size); + + //move to CPU since element-wise operation is expensive and can go wrong in GPU + int curDevId = m_classInfoLocal->GetDeviceId(); + 
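        
The loop that follows (after the transfer to CPU) fills m_classInfoLocal, a 2 x class_size
matrix in which row 0 holds the id of the first word of each class and row 1 holds one past
the last, assuming word ids are grouped by class id. These ranges are what let the
class-based output layer factor P(w|h) as P(class(w)|h) * P(w|h, class(w)) and normalize
over one class instead of the whole vocabulary. A standalone sketch of the same boundary
computation; computeClassRanges and its types are illustrative, not reader code:

#include <iostream>
#include <utility>
#include <vector>

// Given per-word class ids in nondecreasing order, return for each class the
// half-open range [begin, end) of word ids it covers -- the same layout the
// reader stores in rows 0 and 1 of m_classInfoLocal.
static std::vector<std::pair<int, int>> computeClassRanges(const std::vector<int>& wordClass,
                                                           int numClasses)
{
    std::vector<std::pair<int, int>> ranges(numClasses, {0, 0});
    int prevCls = -1;
    for (int w = 0; w < (int)wordClass.size(); ++w)
    {
        if (wordClass[w] != prevCls)
        {
            if (prevCls >= 0)
                ranges[prevCls].second = w;  // close the previous class
            prevCls = wordClass[w];
            ranges[prevCls].first = w;       // open the new class
        }
    }
    if (prevCls >= 0)
        ranges[prevCls].second = (int)wordClass.size();
    return ranges;
}

int main()
{
    // 6 words in 3 classes: words 0-1 -> class 0, words 2-4 -> class 1, word 5 -> class 2
    std::vector<int> wordClass = {0, 0, 1, 1, 1, 2};
    for (const auto& r : computeClassRanges(wordClass, 3))
        std::cout << "[" << r.first << "," << r.second << ")\n";  // [0,2) [2,5) [5,6)
    return 0;
}
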
m_classInfoLocal->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + + int clsidx; + int prvcls = -1; + for (size_t j = 0; j < nwords; j++) + { + clsidx = idx4class[(int)j]; + if (prvcls != clsidx) + { + if (prvcls >= 0) + (*m_classInfoLocal)(1, prvcls) = (float)j; + prvcls = clsidx; + (*m_classInfoLocal)(0, prvcls) = (float)j; + } + } + (*m_classInfoLocal)(1, prvcls) = (float)nwords; + + m_classInfoLocal->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + + int oldDeviceId = clsinfo->GetDeviceId(); + // caution, SetValue changes m_classInfoLocal from GPU to CPU, may change this behavior later + clsinfo->SetValue(*m_classInfoLocal); + clsinfo->TransferFromDeviceToDevice(clsinfo->GetDeviceId(), oldDeviceId, true); + + m_clsinfoRead = true; +} + +template +bool SequenceReader::GetMinibatch(std::map*>& matrices) +{ + + // get out if they didn't call StartMinibatchLoop() first + if (m_mbSize == 0) + return false; + + // check to see if we have changed epochs, if so we are done with this one. + if (m_sequence.size() > 0 && m_mbStartSample > m_sequence[m_sequence.size()-1]) + return false; + + bool moreData = EnsureDataAvailable(m_mbStartSample); + if (moreData == false) + return false; + + // figure which sweep of the randomization we are on + size_t recordStart = m_totalSamples?m_mbStartSample%m_totalSamples:m_mbStartSample; + + // actual size is the size of the next seqence + size_t actualmbsize = 0; + + // figure out the size of the next sequence + if (m_seqIndex > 0 && m_seqIndex < m_sequence.size() && m_sequence.size() > 1) + { + actualmbsize = m_sequence[m_seqIndex] - m_sequence[m_seqIndex-1]; + } + else + { + actualmbsize = m_sequence[0]; + } + + if (actualmbsize > m_mbSize){ + RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. 
memory can crash!", m_mbSize, actualmbsize); + } + + // hit the end of the dataset, + if (!moreData) + { + // make sure we take into account hitting the end of the dataset (not wrapping around) + actualmbsize = min(m_totalSamples-recordStart,actualmbsize); + } + + // now get the labels + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + if (labelInfo.type == labelCategory) + { + memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*actualmbsize); + memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*actualmbsize); + } + else if (labelInfo.type != labelNone) + { + memset(m_labelsBuffer,0,sizeof(ElemType)*1*actualmbsize); + } + + if (actualmbsize > 0) + { + + memset(m_featuresBuffer, 0, sizeof(ElemType)*actualmbsize*labelInfo.dim); + + //loop through all the samples + int j = 0; + Matrix& features = *matrices[m_featuresName]; + if (matrices.find(m_featuresName) != matrices.end()) + { + if(features.GetMatrixType() == MatrixType::DENSE) + { + features.Resize(labelInfo.dim, actualmbsize, false); + features.SetValue(0); + } + else + { + features.Resize(labelInfo.dim, actualmbsize); + features.Reset(); + } + } + + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + + // vector of feature data goes into matrix column + size_t idx = (size_t)m_featureData[jRand]; + m_featuresBuffer[j*labelInfo.dim + idx] = (ElemType)1; + + if (matrices.find(m_featuresName) != matrices.end()) + features.SetValue(idx, j, (ElemType)1); + } + + GetLabelOutput(matrices, m_mbStartSample, actualmbsize); + GetInputToClass(matrices); + GetClassInfo(matrices); + + // make sure that the sequence index matches our end index + assert(m_sequence[m_seqIndex] == m_mbStartSample+actualmbsize); + // go to the next sequence + m_seqIndex++; + } + + // advance to the next minibatch + m_mbStartSample += actualmbsize; + + // if they don't want partial minibatches, skip data transfer and return + if (actualmbsize == 0) // no records found (end of minibatch) + { + return false; + } + + // now transfer to the GPU as needed + try{ + // get the features array + if (matrices.find(m_featuresName) == matrices.end()) + { + Matrix& nbs = *matrices[L"numberobs"]; + int curDevId = nbs.GetDeviceId(); + nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + nbs(0,0) = (float)actualmbsize; + nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + for (size_t i = 0; i < actualmbsize; i++) + { + std::wstring ws = msra::strfun::wstrprintf (L"feature%d", i); + Matrix& features = *matrices[ws]; + features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); + } + } + }catch(...) + { + RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); + } + + try + { + if (labelInfo.type == labelCategory) + { + if (matrices.find(m_labelsName[labelInfoOut]) == matrices.end()) + { + for (size_t i = 0; i < actualmbsize; i++) + { + std::wstring ws = msra::strfun::wstrprintf (L"label%d", i); + Matrix* labels = matrices[ws]; + labels->SetValue(labelInfo.dim, 1, &m_labelsBuffer[i * labelInfo.dim],matrixFlagNormal); + } + } + } + else if (labelInfo.type != labelNone) + { + Matrix* labels = matrices[m_labelsName[labelInfoOut]]; + labels->SetValue(1, actualmbsize,m_labelsBuffer,matrixFlagNormal); + } + }catch(...) 
+ { + RuntimeError("cannot find matrices for %s", m_labelsName[labelInfoOut]); + } + + // we read some records, so process them + return true; +} + +template +void SequenceReader::OrganizeClass() +{ + //allocate auxiliary class variables (for faster search when normalizing probability at output layer) + int cl, i; + for (i=0; i +const std::map::LabelIdType, typename IDataReader::LabelType>& SequenceReader::GetLabelMapping(const std::wstring& sectionName) +{ + if (m_cachingReader) + { + return m_cachingReader->GetLabelMapping(sectionName); + } + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + return labelInfo.mapIdToLabel; +} + +// SetLabelMapping - Sets the label mapping from integer index to label +// labelMapping - mapping table from label values to IDs (must be 0-n) +// note: for tasks with labels, the mapping table must be the same between a training run and a testing run +template +void SequenceReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename LabelType>& labelMapping) +{ + if (m_cachingReader) + { + RuntimeError("Cannot set mapping table when the caching reader is being used"); + } + LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + labelInfo.mapIdToLabel = labelMapping; + labelInfo.mapLabelToId.clear(); + for (std::pair var : labelMapping) + { + labelInfo.mapLabelToId[var.second] = var.first; + } +} + +// GetData - Gets metadata from the specified section (into CPU memory) +// sectionName - section name to retrieve data from +// numRecords - number of records to read +// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request +// dataBufferSize - [in] size of the databuffer in bytes +// [out] size of buffer filled with data +// recordStart - record to start reading from, defaults to zero (start of data) +// returns: true if data remains to be read, false if the end of data was reached +template +bool SequenceReader::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) +{ + if (!m_cachingReader) + RuntimeError("GetData not supported in SequenceReader"); + return m_cachingReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart); +} + +// instantiate all the combinations we expect to be used +template class SequenceReader; +template class SequenceReader; + +template +void BatchSequenceReader::Init(const ConfigParameters& readerConfig) +{ + // See if the user wants caching + m_cachingReader = NULL; + m_cachingWriter = NULL; + + // NOTE: probably want to re-enable at some point + + // initialize the cache + //InitCache(readerConfig); + //m_readerConfig = readerConfig; + + //// if we have a cache, no need to parse the test files... 
+ //if (m_cachingReader) + // return; + + std::vector features; + std::vector labels; + GetFileConfigNames(readerConfig, features, labels); + if (features.size() > 0) + { + m_featuresName = features[0]; + } + + if (labels.size() == 2) + { + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelsName[index] = labels[index]; + } + } + else + RuntimeError("two label definitions (in and out) required for Sequence Reader"); + + ConfigParameters featureConfig = readerConfig(m_featuresName,""); + ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")}; + + class_size = 0; + m_featureDim = featureConfig("dim"); + for (int index = labelInfoMin; index < labelInfoMax; ++index) + { + m_labelInfo[index].idMax = 0; + m_labelInfo[index].beginSequence = labelConfig[index]("beginSequence", ""); + m_labelInfo[index].endSequence = labelConfig[index]("endSequence", ""); + + // determine label type desired + std::string labelType(labelConfig[index]("labelType","Category")); + if (labelType == "Category") + { + m_labelInfo[index].type = labelCategory; + } + else if (labelType == "NextWord") + { + // in this case, it's all identical to the Input labels, except the data type + m_labelInfo[index].type = labelNextWord; + m_labelInfo[index].dim = m_labelInfo[labelInfoIn].dim; + } + else if (labelType == "None") + { + m_labelInfo[index].type = labelNone; + m_labelInfo[index].dim = 0; // override for no labels + } + + // if we have labels, we need a label Mapping file, it will be a file with one label per line + if (m_labelInfo[index].type != labelNone) + { + std::wstring wClassFile = readerConfig("wordclass", ""); + nwords = labelConfig[index]("labelDim"); + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + } + + std::vector arrayLabels; + std::wstring labelPath = labelConfig[index]("labelMappingFile"); + if (fexists(labelPath)) + { + LoadLabelFile(labelPath, arrayLabels); + for (int i=0; i < arrayLabels.size(); ++i) + { + LabelType label = arrayLabels[i]; + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)arrayLabels.size(); + m_labelInfo[index].mapName = labelPath; + } + else + { + if (wClassFile != L""){ + ReadClassInfo(wClassFile , false); + int iMax = -1, i; + for (auto ptr = word4idx.begin(); ptr != word4idx.end(); ptr++) + { + LabelType label = ptr->first; + i = ptr->second; + iMax = max(i, iMax); + m_labelInfo[index].mapIdToLabel[i] = label; + m_labelInfo[index].mapLabelToId[label] = i; + } + m_labelInfo[index].idMax = (LabelIdType)(iMax+1); + + OrganizeClass(); + + } + m_labelInfo[index].mapName = labelPath; + + m_labelInfo[index].fileToWrite = labelPath; + } + } + + m_labelInfo[index].dim = labelConfig[index]("labelDim"); + + // update dimension if the file says it's bigger + if (m_labelInfo[index].dim < m_labelInfo[index].idMax) + { + m_labelInfo[index].dim = m_labelInfo[index].idMax; + } + } + + // initialize all the variables + m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = m_seqIndex = 0; + m_endReached = false; + m_readNextSampleLine = 0; + m_readNextSample = 0; + m_traceLevel = readerConfig("traceLevel","0"); + m_parser.SetTraceLevel(m_traceLevel); + + if (readerConfig.Exists("randomize")) + { + string randomizeString = readerConfig("randomize"); + if (randomizeString == "None") + { + ; + } + else if (randomizeString == "Auto") + { + ; + } + else + { + ;//readerConfig("randomize"); + } + } + else + { + ; 
//randomizeAuto; + } + + // The input data is a combination of the label Data and extra feature dims together +// m_featureCount = m_featureDim + m_labelInfo[labelInfoIn].dim; + m_featureCount = 1; + + std::wstring m_file = readerConfig("file"); + if (m_traceLevel > 0) + fprintf(stderr, "reading sequence file %ws\n", m_file.c_str()); + + const LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + const LabelInfo& labelOut = m_labelInfo[labelInfoOut]; + m_parser.ParseInit(m_file.c_str(), m_featureDim, labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence); + + mBlgSize = readerConfig("nbruttsineachrecurrentiter", "1"); +} + +template +void BatchSequenceReader::Reset() +{ + mProcessed.clear(); + mToProcess.clear(); + mLastProcssedSentenceId = 0; + mPosInSentence = 0; + mLastPosInSentence = 0; + mNumRead = 0; + + if (m_labelTemp.size() > 0) + m_labelTemp.clear(); + if (m_featureTemp.size() > 0) + m_featureTemp.clear(); + m_parser.mSentenceIndex2SentenceInfo.clear(); +} + +template +void BatchSequenceReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) +{ + // if we aren't currently caching, see if we can use a cache + if (!m_cachingReader && !m_cachingWriter) + { + InitCache(m_readerConfig); + if (m_cachingReader) + ReleaseMemory(); // free the memory used by the SequenceReader + } + + // if we are reading from the cache, do so now and return + if (m_cachingReader) + { + m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples); + return; + } + + if (m_featuresBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + m_featuresBuffer = new ElemType[mbSize*labelInfo.dim]; + memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*labelInfo.dim); + } + + if (m_labelsBuffer==NULL) + { + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + if (labelInfo.type == labelCategory) + { + m_labelsBuffer = new ElemType[labelInfo.dim*mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*labelInfo.dim*mbSize); + m_labelsIdBuffer = new IDataReader::LabelIdType[mbSize]; + memset(m_labelsIdBuffer,0,sizeof(IDataReader::LabelIdType)*mbSize); + } + else if (labelInfo.type != labelNone) + { + m_labelsBuffer = new ElemType[mbSize]; + memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize); + m_labelsIdBuffer = NULL; + } + } + + m_featuresBufferRow = new size_t[mbSize]; + m_featuresBufferRowIdx = new size_t[mbSize]; + + m_labelsIdBufferRow = new CPUSPARSE_INDEX_TYPE[2 * mbSize]; + m_labelsBlock2Id = new size_t[2*mbSize]; + m_labelsBlock2UniqId = new size_t[2*mbSize]; + + m_id2classLocal = new Matrix(CPUDEVICE); + m_classInfoLocal = new Matrix(CPUDEVICE); + + m_mbSize = mbSize; + if (requestedEpochSamples == requestDataSize) + { + if (!m_endReached) + { + m_epochSize = requestDataSize; + } + } + else + { + m_epochSize = requestedEpochSamples; + } + + // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set + size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize; + m_epoch = epoch; + m_mbStartSample = epoch*m_epochSize; + + // allocate room for the data + m_featureData.reserve(m_featureCount*epochSize); + if (m_labelInfo[labelInfoOut].type == labelCategory) + m_labelIdData.reserve(epochSize); + else if (m_labelInfo[labelInfoOut].type != labelNone) + m_labelData.reserve(epochSize); + m_sequence.reserve(m_seqIndex); // clear 
out the sequence array + /// this is too complicated for LM + // SetupEpoch(); + /// use the LMSetupEpoch() instead + LMSetupEpoch(); + + m_clsinfoRead = false; + m_idx2clsRead = false; + + m_parser.ParseReset(); + + Reset(); +} + +template +size_t BatchSequenceReader::FindNextSentences(size_t numRead) +{ + size_t sln = 0; + + if (numRead == 0) return 0; + + if (mProcessed.size() == 0) + { + mProcessed.resize(numRead, false); + } + + if (mToProcess.size() > 0) + { + bool allDone = false; + for (int s = 0; s < mToProcess.size(); s++) + { + int mp = (int)mToProcess[s]; + if (mProcessed[mp]) + { + mLastProcssedSentenceId = mp; + mLastPosInSentence = 0; + allDone = true; + break; + } + } + if (allDone) + { + mToProcess.clear(); + } + } + + if (mToProcess.size() > 0) + { + sln = m_parser.mSentenceIndex2SentenceInfo[mToProcess[0]].sLen; + return sln; + } + + for (size_t seq = mLastProcssedSentenceId ; seq < numRead; seq++) + { + if (mProcessed[seq]) continue; + + if (sln == 0) + { + sln = m_parser.mSentenceIndex2SentenceInfo[seq].sLen; + } + if (sln == m_parser.mSentenceIndex2SentenceInfo[seq].sLen && + mProcessed[seq] == false && mToProcess.size() < mBlgSize) + mToProcess.push_back(seq); + + if (mToProcess.size() == mBlgSize) break; + } + + return sln; +} + +template +bool BatchSequenceReader::EnsureDataAvailable(size_t /*mbStartSample*/) +{ + bool bDataIsThere = true; + + m_featureData.clear(); + m_labelIdData.clear(); + + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + + bool nextWord = false; + if (m_labelInfo[labelInfoOut].type == labelNextWord) + { + nextWord = true; + } + LabelInfo& labelInfo = m_labelInfo[nextWord?labelInfoIn:labelInfoOut]; + + // see how many we already read + std::vector seqPos; + + size_t sLn = FindNextSentences(mNumRead); + if (sLn == 0) + { + Reset(); + + mNumRead = m_parser.Parse(CACHE_BLOG_SIZE, &m_labelTemp, &m_featureTemp, &seqPos); + if (mNumRead == 0) return false; + + std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end()); + + m_readNextSampleLine += mNumRead; + sLn = FindNextSentences(mNumRead); + } + + /// add one minibatch + size_t i = mLastPosInSentence; + size_t j = 0; + // exclude the last token since it is the last label to be predicted + for (i = mLastPosInSentence; j < m_mbSize && i < sLn-1; i++ , j++) + { + for (int k = 0; k < mToProcess.size(); k++) + { + size_t seq = mToProcess[k]; + size_t label = m_parser.mSentenceIndex2SentenceInfo[seq].sBegin + i; + + // labelIn should be a category label + LabelType labelValue = m_labelTemp[label++]; + + // to-do, should ignore , check the sentence ending is + // need to remove from the training set + // allocate and initialize the next chunck of featureData + if (labelIn.type == labelCategory) + { + LabelIdType index = GetIdFromLabel(labelValue, labelIn); + + // use the found value, and set the appropriate location to a 1.0 + assert(labelIn.dim > index); // if this goes off labelOut dimension is too small + m_featureData.push_back((float)index); + } + else + { + RuntimeError("Input label expected to be a category label"); + } + + // now get the output label + if (m_labelInfo[labelInfoOut].type == labelCategory) + { + labelValue = m_labelTemp[label++]; + } + else if (nextWord) + { + // this is the next word (label was incremented above) + labelValue = m_labelTemp[label]; + if (!_stricmp(labelValue.c_str(), m_labelInfo[labelInfoIn].endSequence.c_str())) + { + labelValue = labelInfo.endSequence; + } + } + else + { + 
RuntimeError("Invalid output label type, expected Category, or Next Word"); + } + + // get the ID from the label + LabelIdType id = GetIdFromLabel(labelValue, labelInfo); + m_labelIdData.push_back(id); + + m_totalSamples ++; + } + } + + mLastPosInSentence = i; + + return bDataIsThere; +} + +template +size_t BatchSequenceReader::NumberSlicesInEachRecurrentIter() +{ + size_t sz = mToProcess.size(); + return sz; +} + +template +void BatchSequenceReader::SetNbrSlicesEachRecurrentIter(const size_t mz) +{ + mBlgSize = mz; +} + +template +bool BatchSequenceReader::GetMinibatch(std::map*>& matrices) +{ + + // get out if they didn't call StartMinibatchLoop() first + if (m_mbSize == 0) + return false; + + bool moreData = EnsureDataAvailable(m_mbStartSample); + if (moreData == false) + return false; + + // actual size is the size of the next seqence + size_t actualmbsize = 0; + + // figure out the size of the next sequence + actualmbsize = m_labelIdData.size() ; + if (actualmbsize > m_mbSize * mToProcess.size()){ + RuntimeError("specified minibatch size %d is smaller than the actual minibatch size %d. memory can crash!", m_mbSize, actualmbsize); + } + + // now get the labels + const LabelInfo& labelInfo = m_labelInfo[( m_labelInfo[labelInfoOut].type == labelNextWord)?labelInfoIn:labelInfoOut]; + + if (actualmbsize > 0) + { + + //loop through all the samples + Matrix& features = *matrices[m_featuresName]; + + // copy m_featureData to matrix + // we always copy it to cpu first and then convert to gpu if gpu is desired. + DEVICEID_TYPE featureDeviceId = features.GetDeviceId(); + features.TransferFromDeviceToDevice(featureDeviceId, CPUDEVICE, false, true, false); + + if (features.GetMatrixType() == MatrixType::DENSE) + { + features.Resize(labelInfo.dim, actualmbsize); + features.SetValue(0); + } + else + { + features.Resize(labelInfo.dim, actualmbsize, actualmbsize); + features.Reset(); + } + + for (size_t j = 0; j < actualmbsize; ++j) + { + // vector of feature data goes into matrix column + size_t idx = (size_t)m_featureData[j]; + + //if (matrices.find(m_featuresName) != matrices.end()) + features.SetValue(idx, j, (ElemType)1); + } + + features.TransferFromDeviceToDevice(CPUDEVICE, featureDeviceId, false,false, false); + + //else // for GPU + //{ + // if (matrices.find(m_featuresName) != matrices.end()) + // { + // m_indexer.clear(); + // size_t size = m_featureData.size(); + + // for(int i = 0; i < size; i++) + // { + // m_featuresBufferRow[i] = (size_t)m_featureData[i]; + // if(m_indexer.find(m_featuresBufferRow[i]) == m_indexer.end()) + // { + // m_indexer[m_featuresBufferRow[i]] = m_indexer.size(); + // } + // m_featuresBufferRowIdx[i] = m_indexer[m_featuresBufferRow[i]]; + // } + // features.SetMatrixFromCSCFormat(m_featuresBufferRow, m_featuresBufferRowIdx, size, m_indexer.size()); + // } + //} + + // TODO: move these two methods to startMiniBatchLoop() + GetInputToClass(matrices); + GetClassInfo(matrices); + GetLabelOutput(matrices, 0, actualmbsize); + + // go to the next sequence + m_seqIndex++; + } + else + return false; + + // now transfer to the GPU as needed + try{ + // get the features array + if (matrices.find(m_featuresName) == matrices.end()) + { + Matrix& nbs = *matrices[L"numberobs"]; + int curDevId = nbs.GetDeviceId(); + nbs.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false); + nbs(0,0) = (float)actualmbsize; + nbs.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + for (size_t i = 0; i < actualmbsize; i++) + { + std::wstring ws = 
msra::strfun::wstrprintf (L"feature%d", i); + Matrix& features = *matrices[ws]; + features.SetValue(labelInfo.dim, 1, &m_featuresBuffer[i*labelInfo.dim],matrixFlagNormal); + } + } + }catch(...) + { + RuntimeError("features size might not be sufficiently large. The asked minibatch size is %s. check minibatchSize in the feature definition" ,actualmbsize); + } + + // we read some records, so process them + return true; +} + +template +void BatchSequenceReader::SetSentenceEnd(int wrd, int pos, int actualMbSize) +{ + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + LabelIdType index = GetIdFromLabel(labelIn.endSequence.c_str(), labelIn); + + if (pos == actualMbSize - 1) + { + if (wrd == (int)index) + mSentenceEnd = true; + else + mSentenceEnd = false; + } +} + +template +void BatchSequenceReader::SetSentenceBegin(int wrd, int pos, int /*actualMbSize*/) +{ + // now get the labels + LabelInfo& labelIn = m_labelInfo[labelInfoIn]; + LabelIdType index = GetIdFromLabel(labelIn.beginSequence.c_str(), labelIn); + + if (pos == 0) + { + if (wrd == (int)index) + mSentenceBegin = true; + else + mSentenceBegin = false; + } +} + +template +void BatchSequenceReader::SetSentenceEndInBatch(vector &sentenceEnd) +{ + sentenceEnd.resize(mToProcess.size()); + if (mSentenceBegin) + { + sentenceEnd.assign(mToProcess.size(), 0); + } + else + { + sentenceEnd.assign(mToProcess.size(), m_mbSize+2); + } +} + +template +bool BatchSequenceReader::DataEnd(EndDataType endDataType) +{ + bool ret = false; + switch (endDataType) + { + case endDataNull: + assert(false); + break; + case endDataEpoch: + case endDataSet: + ret = !EnsureDataAvailable(m_mbStartSample); + break; + case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true + if (mSentenceEnd) + { + for (auto ptr = mToProcess.begin(); ptr != mToProcess.end(); ptr++) + mProcessed[*ptr] = true; + } + ret = mSentenceEnd; + break; + } + return ret; + +} + +template +void BatchSequenceReader::GetLabelOutput(std::map*>& matrices, + size_t m_mbStartSample, size_t actualmbsize) +{ + size_t j = 0; + Matrix* labels = matrices[m_labelsName[labelInfoOut]]; + if (labels == nullptr) return; + + if(labels->GetMatrixType() == MatrixType::DENSE) + { + labels->Resize(nwords + class_size, actualmbsize, false); + labels->SetValue(0); + } + else + { + labels->Resize(nwords + class_size, actualmbsize, 2*actualmbsize); + labels->Reset(); + } + + if(labels->GetCurrentMatrixLocation() == CPU) { + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + + int wrd = m_labelIdData[jRand]; + int clsidx = idx4class[wrd]; + + labels->SetValue(wrd, j, 1); + + SetSentenceEnd(wrd, j, actualmbsize); + SetSentenceBegin(wrd, j, actualmbsize); + + if (class_size > 0) + labels->SetValue(nwords + clsidx, j, 1); + } + } + else // GPU + { + m_indexer.clear(); + int p = 0; + int b = 0; + int nz = 0; + + for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample) + { + // pick the right sample with randomization if desired + size_t jRand = jSample; + int wrd = m_labelIdData[jRand]; + int clsidx = idx4class[wrd]; + SetSentenceEnd(wrd, j, actualmbsize); + SetSentenceBegin(wrd, j, actualmbsize); + + int start[2]; + int end[2]; + int target[2]; + int blockId[2]; + + start[0] = (int)(*m_classInfoLocal)(0, clsidx); + end[0] = (int)(*m_classInfoLocal)(1, clsidx); + target[0] = wrd; + blockId[0] = clsidx; + start[1] = nwords; + end[1] = nwords 
+ (int)(*m_classInfoLocal).GetNumCols(); + target[1] = nwords + clsidx; + blockId[1] = -1; + + for(int i = 0; i < 2; i++) + { + m_labelsIdBufferRow[p] = target[i]; + int len = end[i] - start[i]; + + if(m_indexer.find(blockId[i]) == m_indexer.end()) + { + m_indexer[blockId[i]] = b; + b += len; + } + m_labelsBlock2Id[p] = nz; + m_labelsBlock2UniqId[p] = m_indexer[blockId[i]]; + nz += len; + p++; + } + } + + labels->SetMatrixFromLabelAndClass(m_labelsIdBufferRow, m_labelsBlock2Id, m_labelsBlock2UniqId, 2*actualmbsize, nz, b); + } +} + +template class BatchSequenceReader; +template class BatchSequenceReader; + }}} \ No newline at end of file diff --git a/MachineLearning/CNTKEval/CNTKEval.vcxproj b/MachineLearning/CNTKEval/CNTKEval.vcxproj index f1dfe619a..2a981d6d6 100644 --- a/MachineLearning/CNTKEval/CNTKEval.vcxproj +++ b/MachineLearning/CNTKEval/CNTKEval.vcxproj @@ -60,7 +60,7 @@ - Use + NotUsing Level4 Disabled EVALDLL;WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions) @@ -79,7 +79,7 @@ Level4 - Use + NotUsing MaxSpeed true true @@ -107,6 +107,7 @@ + @@ -127,6 +128,7 @@ NotUsing + NotUsing diff --git a/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters b/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters index f19953971..2c505bfdf 100644 --- a/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters +++ b/MachineLearning/CNTKEval/CNTKEval.vcxproj.filters @@ -1,50 +1,56 @@ - - - - - - - - - - - Common - - - Common - - - Common - - - Common - - - - - - - - - - Common\Include - - - Common\Include - - - Common\Include - - - Common\Include - - - - - {bed53b47-70b1-494c-824d-0748362003b2} - - - {f3bf0104-8a08-40c9-a4d9-af8411c49669} - - + + + + + + + + + + + Common + + + Common + + + Common + + + Common + + + Common + + + + + + + + + + Common\Include + + + Common\Include + + + Common\Include + + + Common\Include + + + Common\Include + + + + + {bed53b47-70b1-494c-824d-0748362003b2} + + + {f3bf0104-8a08-40c9-a4d9-af8411c49669} + + \ No newline at end of file diff --git a/MachineLearning/cn/ComputationNode.h b/MachineLearning/cn/ComputationNode.h index 7ec090964..1012db450 100644 --- a/MachineLearning/cn/ComputationNode.h +++ b/MachineLearning/cn/ComputationNode.h @@ -3111,6 +3111,9 @@ protected: \ inputGradientValues.Print("child Gradient-in/out"); inputFunctionValues.Print("child Function values"); #endif + //currently we only support one combination when the input is sparse. + if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE) + inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol); Matrix::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues); #if DUMPOUTPUT diff --git a/MachineLearning/cn/NetworkDescriptionLanguage.h b/MachineLearning/cn/NetworkDescriptionLanguage.h index e2e2fdfb3..cf158d785 100644 --- a/MachineLearning/cn/NetworkDescriptionLanguage.h +++ b/MachineLearning/cn/NetworkDescriptionLanguage.h @@ -1,1067 +1,1072 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// -#pragma once -#include "commandArgUtil.h" -#include "ComputationNode.h" -#include "TrainingCriterionNode.h" -#include "CompositeComputationNode.h" -#include "EvaluationCriterionNode.h" -#include "ComputationNetwork.h" -#include - -namespace Microsoft { namespace MSR { namespace CNTK { - -// EqualInsensitive - check to see if two nodes are equal up to the length of the first string (must be at least half as long as actual node name) -// string1 - [in,out] string to compare, if comparision is equal insensitive but not sensitive, will replace with sensitive version -// string2 - second string to compare -// alternate - alternate naming of the string -// return - true if strings are equal insensitive and modifies string1 to sensitive version if different -bool EqualInsensitive(std::wstring& string1, const std::wstring& string2, const wchar_t* alternate=NULL); - -// CheckFunction - check to see if we match a function name -// string1 - [in,out] string to compare, if comparision is equal and at least half the full node name will replace with full node name -// allowUndeterminedVariable - [out] set to true if undetermined variables (symbols yet to be defined) are allowed here -// return - true if function name found -template -bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable=nullptr); - -// NDLType - Network Description Language node type -enum NDLType -{ - ndlTypeNull, - ndlTypeConstant, - ndlTypeFunction, - ndlTypeVariable, - ndlTypeParameter, // parameter value, must be looked up to get actual value - ndlTypeUndetermined, // an undetermined value that will later be resolved - ndlTypeOptionalParameter, - ndlTypeArray, - ndlTypeMacroCall, // calling a macro - ndlTypeMacro, // definition of a macro - ndlTypeMax -}; - -// NDLPass - enumeration for the number of passes through the NDL parser -enum NDLPass -{ - ndlPassInitial, // inital pass, create nodes - ndlPassResolve, // resolve any undetermined symbols (variables that were not yet declared in NDL) - ndlPassFinal, // final pass done post-validation (when all matrices are allocated to the correct size) - ndlPassAll = ndlPassFinal, // all passes, used as flag in NDLUtil.h - ndlPassMax // number of NDLPasses -}; - -// ++ operator for this enum, so loops work -NDLPass &operator++(NDLPass &ndlPass); - -// Predeclaration of Script and Node -template -class NDLScript; - -template -class NDLNode; - -// NDLNodeEvaluator - Node evaluaton interface -// implemented by execution engines to convert script to approriate internal formats -template -class NDLNodeEvaluator -{ -public: - virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) = 0; - virtual ~NDLNodeEvaluator() = 0; - - // EvaluateParameter - Evaluate a parameter of a call - // node - NDLNode of the script - // nodeParam - NDLNode parameter we are evaluating - // baseName - name of the base node - // pass - which pass through the NDL nodes - // returns: the node that is the evaluated parameter - virtual NDLNode* EvaluateParameter(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseName, const NDLPass pass ) = 0; - - // EvaluateParameters - Evaluate the parameters of a call - // node - NDLNode we are evaluating paramters for - // baseName - baseName for the current node - // nodeParamStart - starting parameter that contains a node - // nodeParamCount - ending parameter that contains a node - // pass - NDL pass we are evaluating - // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator - virtual 
std::vector EvaluateParameters(NDLNode* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass) = 0; - - // FindSymbol - Search the engines symbol table for a fully quantified symbol - // symbol - name of the symbol - // returns - pointer to the matching EvalValue for that node, of NULL if not found - virtual void* FindSymbol(const wstring& /*symbol*/) - { - return NULL; - } - // ProcessOptionalParameters - Process the optional parameters of a node - // node to process - virtual void ProcessOptionalParameters(NDLNode* /*node*/) - { - return; - } - -}; - -template class NDLNodeEvaluator; -template class NDLNodeEvaluator; - -template -class NetNdl // class to associate a network with an NDLScript -{ -public: - ComputationNetwork* cn; - NDLScript* ndl; // NDLScript we are using for this network. NOTE: the actual script used - NDLNode* lastNode[ndlPassMax]; // last node we evaluated for each pass - NetNdl(): cn(nullptr), ndl(nullptr) {ClearLastNodes();} - NetNdl(ComputationNetwork*p_cn): cn(p_cn), ndl(nullptr) {ClearLastNodes();} - NetNdl(ComputationNetwork*p_cn, NDLScript* p_ndl): cn(p_cn), ndl(p_ndl) {ClearLastNodes();} - ~NetNdl() - {} - - // ClearLastNodes - Clear out the last node values for all passes - void ClearLastNodes() - { - for (NDLPass pass=ndlPassInitial;pass < ndlPassMax;++pass) - { - lastNode[pass] = nullptr; - } - } - - // Clear - clear out everything in the structure - // NOTE: this deletes the network and the NDLScript, use with care! - void Clear() - { - delete cn; - delete ndl; - cn = nullptr; - ndl = nullptr; - ClearLastNodes(); - } -}; - -template -inline NDLNodeEvaluator::~NDLNodeEvaluator() { } // defined even though it's virtual; supposed to be faster this way - -// NDLNode - Network Description Language Node -// Used to represent a named entity in the NDL -// if a name is not provided (such as in nesting scenarios) one will be generated -template -class NDLNode -{ -private: - std::string m_name; // value on the left of the equals - ConfigValue m_value; // value on the right of the equals (CN node name, or value) - NDLScript* m_parent; // parent script - NDLType m_type; //type of node - ConfigArray m_paramString; // parameter of a function/array - ConfigArray m_paramMacro; // parameter of a macro (the variables used in the macro definition) - vector m_parameters; // parameters as nodes/array elements - void *m_eval; // pointer to an arbitrary eval structure - NDLScript* m_script; // script for ndlTypeMacro - static int s_nameCounter; // counter for generating unique names -public: - NDLNode(const std::string& name, ConfigValue value, NDLScript* parent, NDLType ndlType) - { - if (name.empty()) - GenerateName(); - else - m_name = name; - m_value = value; - m_parent = parent; - assert(parent != NULL); - parent->AddChild(this); - m_type = ndlType; - m_eval = NULL; - m_script = NULL; - } - - ~NDLNode() - {} - - // publicly accessible Copy method - // should only be used for macro expansion - NDLNode* Copy() const - { - NDLNode* ret = new NDLNode(*this); - return ret; - } - -private: - - // copy constructor, creates a new disconnected copy of this node for macro expansion - NDLNode(const NDLNode& copyMe); - - NDLNode& operator=(NDLNode& /*copyMe*/) //this is just a place holder implementation which is not functioning but prevent callers to use it. 
- { - throw std::logic_error("'NDLNode& operator=(NDLNode& copyMe)' should never be called."); - } - - // generate a generic symbol name for a node - void GenerateName() - { - char buffer[10]; - sprintf(buffer, "%d", ++s_nameCounter); - m_name = std::string("unnamed") + buffer; - } - -public: - void SetScript(NDLScript* script) {m_script = script;} - NDLScript* GetScript() const {return m_script;} - void SetType(NDLType type) {m_type = type;} - NDLType GetType() const {return m_type;} - const std::string& GetName() const {return m_name;} - void SetName(std::string &name) {m_name = name;} - ConfigValue GetValue() const {return m_value;} - void SetValue(std::string &value) {m_value = value;} - - // parameters of a function (ndlTypFunction), or parameters in the call to a macro - void SetParamString(ConfigValue paramString) {m_paramString = paramString;} - ConfigArray GetParamString() const {return m_paramString;} - - // parameters of a macro - void SetParamMacro(ConfigValue paramMacro) {m_paramMacro = paramMacro;} - ConfigArray GetParamMacro() const {return m_paramMacro;} - - void SetParentScript(NDLScript* script) {m_parent = script;} - NDLScript* GetParentScript() { return m_parent; } - - // get parameters, either just optional or just regular - vector GetParameters(bool optional=false) const - { - vector result; - for (NDLNode* param : m_parameters) - { - bool optParam = param->GetType() == ndlTypeOptionalParameter; - if (optParam == optional) - result.push_back(param); - } - return result; - } - - // Get/Set eval values - void* GetEvalValue() const { return m_eval;} - void SetEvalValue(void* evalValue) {m_eval = evalValue;} - - // GetOptionalParameter - Get an optional parameter value - // name - the name to search for in the optional parameters - // deflt - the default value (if not found) - // returns: parameter value if found, or default value otherwise - ConfigValue GetOptionalParameter(const std::string& name, const std::string& deflt) const - { - for (NDLNode* param : m_parameters) - { - bool optParam = param->GetType() == ndlTypeOptionalParameter; - if (optParam && !_stricmp(param->GetName().c_str(), name.c_str())) - { - return param->GetValue(); - } - } - return ConfigValue(deflt); - } - - // FindNode - Find a node of the given name - // name - name to search for - // searchForDotNames - search for NDL symbols traversing call heirarchy - // returns: The node with that name, or NULL if not found - NDLNode* FindNode(const std::string& name, bool searchForDotNames=false) - { - NDLNode* found = m_parent->FindSymbol(name, searchForDotNames); - if (!found) - found = NDLScript::GlobalScript().FindSymbol(name, searchForDotNames); - return found; - } - - // GetScalar - Get a scalar value from a node, may loop through some variables before arriving - // returns: scalar value - ConfigValue GetScalar() - { - NDLNode* node = this; - while (node && (node->GetType() == ndlTypeVariable || node->GetType() == ndlTypeParameter)) - { - NDLNode* nodeLast = node; - node = node->FindNode(node->GetValue(), true /*searchForDotNames*/); - - // if we are still on the same node, that means it was never resolved to anything, an undefined variable - if (nodeLast == node) - { - RuntimeError("undefined Variable, '%s' found, must be declared before first use\n", node->GetName().c_str()); - } - } - if (!node || node->GetType() != ndlTypeConstant) - { - std::string name = node ? 
node->GetName() : GetName(); - RuntimeError("Scalar expected, '%s' must be a constant or variable that resolves to a constant\n", name.c_str()); - } - return node->GetValue(); - } - - void InsertParam(NDLNode* param) {m_parameters.push_back(param);} - - // EvaluateMacro - Evaluate a macro, make the call - // nodeEval - the node evaluator we are using to interpret the script - // baseName - base name for all symbols at this level - // pass - what NDLPass are we in? - // returns: the return node for this macro - NDLNode* EvaluateMacro(NDLNodeEvaluator& nodeEval, const wstring& baseName, const NDLPass pass) - { - if (m_type != ndlTypeMacroCall) - return NULL; - - // make sure the actual parameters and expected parameters match - if (m_parameters.size() < m_paramMacro.size()) - { - RuntimeError("Parameter mismatch, %d parameters provided, %d expected in call to %s\n", - m_parameters.size(),m_paramMacro.size(),m_value.c_str()); - } - - // assign the actual parameters in the script so we can execute it - for (int i=0; i < m_parameters.size(); ++i) - { - NDLNode* nodeParam = m_parameters[i]; - std::string paramName = i < m_paramMacro.size()?m_paramMacro[i]:nodeParam->GetName(); - - // if the node is a parameter then look it up in the symbol table - if (nodeParam->GetType() == ndlTypeParameter) - { - nodeParam = m_parent->FindSymbol(nodeParam->GetName()); - } - // do we want to add optional parameters as symbols, or not? - else if (nodeParam->GetType() == ndlTypeOptionalParameter) - { - if (i < m_paramMacro.size()) - RuntimeError("Parameter mismatch, parameter %d is an optional parameter, but should be a required parameter\n",i); - // if no symbol yet, add it - if (!m_script->ExistsSymbol(paramName)) - { - m_script->AddSymbol(paramName, nodeParam); - continue; - } - //else assign the value below - } - - // assign the parameter symbols in the script we will call with the values passed to the call - m_script->AssignSymbol(paramName, nodeParam); - - } - - std::wstring newBase = baseName; - if (!newBase.empty()) - newBase += L"."; - newBase += msra::strfun::utf16(m_name); - - // now evaluate the contained macro script - NDLNode* nodeResult = m_script->Evaluate(nodeEval, newBase, pass); - // Consider: do we need to restore the original mapping here, may need to for recursive calls? - - // look for a symbol that is identical to the macro name, if it exists this is the return value - NDLNode* nodeMacroName = m_script->FindSymbol(m_value); - if (nodeMacroName) - { - nodeResult = nodeMacroName; - } - - // set the eval node to be the same as the return value; - if (nodeResult) - { - m_eval = nodeResult->GetEvalValue(); - } - return nodeResult; - } -}; - -template -class NDLScript: public ConfigParser -{ -private: - std::wstring m_baseName; - std::string m_scriptString; - std::vector*> m_script; // script lines in parsed node order, macros will have definition followed by body - std::map*, nocase_compare> m_symbols; // symbol table - NDLNode* m_macroNode; // set when interpretting a macro definition - bool m_noDefinitions; // no definitions can be made in this script, interpret all macro/function names as calls - static NDLScript s_global; //("global"); // global script for storing macros and global nodes - std::vector*> m_children; // child nodes. Note that m_script nodes may not be children of this object, they include macro nodes - ComputationNetwork* m_cn; // computation network to use for backup symbol lookup. 
Used for MEL where NDL and network nodes are mixed - bool m_definingMacro; // currently defining a macro, flag to determine if we are defining or interpretting a macro call - -public: - // constructors that take a config name - NDLScript(const std::string & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } - NDLScript(const std::wstring & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } - ~NDLScript() - { - // need to free all the child nodes attached to this script node - for (NDLNode* node : m_children) - { - delete node; - } - m_children.clear(); - } - - // empty constructor - NDLScript() : ConfigParser(';') { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } // parameterless version if needed - - // construct NDLScript from a ConfigValue, propogate the config Name - NDLScript(const ConfigValue& configValue) : ConfigParser(';',configValue.Name()) - { - m_macroNode = NULL; - m_noDefinitions=false; - m_definingMacro = false; - m_scriptString = configValue; - Parse(m_scriptString); - } - - // construct NDLScript from a ConfigValue, propogate the config Name - // configValue - the body of the macro - // oneLineDefinition - this macro definition is all on one line, names optional - // macroName - if the macro has a name, the name - this is used to get parameter info - NDLScript(const ConfigValue& configValue, std::string macroName, bool oneLineDefinition) : ConfigParser(';',configValue.Name()) - { - m_noDefinitions = oneLineDefinition; - m_definingMacro = true; - m_macroNode = NULL; - m_scriptString = configValue; - NDLNode* ndlNode = s_global.CheckName(macroName, true); - if (ndlNode == NULL) - RuntimeError("Invalid macro definition, %s not found", macroName.c_str()); - - // get and parse the parameters - ConfigArray parameters = ndlNode->GetParamMacro(); - for (auto iter = parameters.begin(); iter != parameters.end(); ++iter) - { - // we are adding parameters that will be replaced by actual values later - ConfigValue param = *iter; - - // check to make sure this parameter name is not a reserved word - std::string functionName = param; - // check for function name, a function may have two valid names - // in which case 'functionName' will get the default node name returned - if (CheckFunction(functionName)) - { - RuntimeError("NDLScript: Macro %s includes a parameter %s, which is also the name of a function. Parameter names may not be the same as function names.", macroName.c_str(), param.c_str()); - } - - NDLNode* paramNode = new NDLNode(param, param, this, ndlTypeParameter); - // add to node parameters - ndlNode->InsertParam(paramNode); - // add to script symbol table - AddSymbol(param, paramNode); - } - Parse(m_scriptString); - m_definingMacro = false; - } - - - // copy and move constructors - NDLScript(const NDLScript& copyMe); - NDLScript(const NDLScript&& moveMe); -private: - NDLNode* DuplicateNode(NDLNode* node); -public: - // GlobalScript - Access to global script - static NDLScript& GlobalScript() {return s_global;} - - // SetMacroDefinitionsAllowed - allow macro definitions - // macroAllowed - can macros be defined in this script? 
- void SetMacroDefinitionsAllowed(bool macroAllowed) - { - m_noDefinitions = !macroAllowed; - } - - void SetBaseName(const std::wstring& baseName) - { - m_baseName = baseName; - } - const std::wstring& GetBaseName() - { - return m_baseName; - } - - void ClearGlobal() - { - s_global.Clear(); - } - - void Clear() - { - - for (NDLNode* node : m_children) - { - delete node; - } - m_children.clear(); - for (NDLNode* node : m_script) - { - delete node; - } - m_script.clear(); - - m_symbols.clear(); - } - void ClearEvalValues() - { - for (NDLNode* node : m_children) - { - node->SetEvalValue(NULL); - } - } - // AddChild - add a child node to the script - // node - node to add - // NOTE: this NDLScript owns this node and is responsible to delete it - void AddChild(NDLNode* node) - { - m_children.push_back(node); - } - - // SetComputationNetwork - set the computation network this NDL is associated with - void SetComputationNetwork(ComputationNetwork* cn) - { - m_cn = cn; - } - - // FindSymbol - Find a symbol to the symbol table - // symbol - symbol to find - // searchForDotNames - search for NDL symbols traversing call heirarchy - // returns - node this symbol references - NDLNode* FindSymbol(const std::string& symbol, bool searchForDotNames=true) - { - auto found = m_symbols.find(symbol); //search symbol directly first - if (found != m_symbols.end()) - return found->second; - - // if not found, handle dot names by move up the hierarchy - size_t firstDot = symbol.find_first_of('.'); - if (firstDot == npos) - return nullptr; - - std::string search = symbol.substr(0,firstDot); - found = m_symbols.find(search); - if (found == m_symbols.end()) - { - return NULL; - } - - // handle dot names, - if (firstDot != npos) - { - NDLNode* node = found->second; - NDLScript* script = node->GetScript(); - // if there is no script, probably a parameter/variable with further 'dot' values (ie. var.CE.BFF) - if (script != NULL) - { - if (node->GetType() != ndlTypeMacroCall || script == NULL) - RuntimeError("Symbol name not valid, %s is not a macro, so %s cannot be interpretted",search.c_str(),symbol.c_str() ); - return script->FindSymbol(symbol.substr(firstDot+1), searchForDotNames); - } - } - return found->second; - } - - // ExistsSymbol - Find if a symbol exists (value might be NULL) - // symbol - symbol to find - // returns - true if it's there - bool ExistsSymbol(const std::string& symbol) - { - auto found = m_symbols.find(symbol); - return (found != m_symbols.end()); - } - - // ContainsOptionalParameter - do any nodes in this script have an optional parameter by the following name? 
- // optParamName - name of parameter we are searching for - // returns: vector of the nodes found (empty if nothing found) - vector*> ContainsOptionalParameter(const std::string& optParamName) - { - vector*> result; - std::string empty; - for (auto symbol : m_symbols) - { - NDLNode* node = symbol.second; - std::string value = node->GetOptionalParameter(optParamName, empty); - if (!value.empty()) - { - result.push_back(node); - } - } - return result; - } - - // AddSymbol - Add a symbol to the symbol table - // symbol - symbol to add - // node - node this symbol references - // NOTE: at present we don't allow reuse of a symbol, so this throws an error if it sees an existing symbol - void AddSymbol(const std::string& symbol, NDLNode* node) - { - auto found = m_symbols.find(symbol); - if (found != m_symbols.end()) - { - NDLNode* nodeFound = found->second; - // check for undetermined nodes, because these nodes are to be defined later - if (nodeFound->GetType() != ndlTypeUndetermined && nodeFound->GetType() != ndlTypeParameter) - { - std::string value = found->second->GetValue(); - RuntimeError("Symbol '%s' currently assigned to '%s' reassigning to a different value not allowed\n", symbol.c_str(), value.c_str()); - } - } - m_symbols[symbol] = node; - } - - // AssignSymbol - Assign a new value to a symbol in the table - // symbol - symbol to assign - // node - node this symbol will reference - void AssignSymbol(const std::string& symbol, NDLNode* node) - { - auto found = m_symbols.find(symbol); - if (found == m_symbols.end()) - { - RuntimeError("Symbol '%s' currently does not exist, attempting to assigned value '%s' AssignSymbol() requires existing symbol\n", symbol.c_str(), node->GetValue().c_str()); - } - m_symbols[symbol] = node; - } - - - // FileParse - parse at the file level, can be overridden for "section of file" behavior - // stringParse - file concatentated as a single string - void FileParse(const std::string& stringParse) - { - ConfigParameters sections(stringParse); - bool loadOrRunFound = false; - - // load all the sections that we want (macros) - if (sections.Exists("load")) - { - auto config = ConfigArray(sections("load")); - for (int i=0; i < config.size(); ++i) - { - Parse(sections(config[i])); - } - loadOrRunFound = true; - } - - // load and then execute - if (sections.Exists("run")) - { - auto config = ConfigArray(sections("run")); - for (int i=0; i < config.size(); ++i) - { - Parse(sections(config[i])); - } - loadOrRunFound = true; - } - - // didn't find any of the tags, so just parse the whole thing as a script - if (!loadOrRunFound) - { - // surround text in braces so we parse correctly - std::string textInBraces = "[ "+stringParse+" ]"; - Parse(textInBraces); - } - } - - // IsMacroDefinition - is this a macro definition? 
- // returns - true if a definition, otherwise false - bool IsMacroDefinition() - { - return m_definingMacro; - } - - // CheckName - check for a name in our symbols, see if it exists - // name - name we are looking for - // localOnly - only look in the current scope, and not the global scope - // if it does exist return the node that represents the name - NDLNode* CheckName(const std::string& name, bool localOnly=false) - { - // first try local script - auto found = FindSymbol(name); - if (found != NULL) - { - return found; - } - - // next try the globals, this includes macros and global constants - if (!localOnly) - { - auto found = s_global.FindSymbol(name); - if (found != NULL) - { - NDLNode* node = found; - if (node->GetType() == ndlTypeMacro) - { - // if we are calling a macro we need to keep track of formal parameters, - // keep them as strings in this macroCall node - NDLNode* newNode = new NDLNode("", name, this, ndlTypeMacroCall); - NDLScript* script = node->GetScript(); - - // if this is a macro call (and not a definition), we want to expand the macro (make a copy) - if (!IsMacroDefinition()) - { - script = new NDLScript(*script); - } - newNode->SetScript(script); - - newNode->SetParamMacro(node->GetParamMacro()); - node = newNode; - } - return node; - } - } - - std::string functionName = name; - // check for function name, a function may have two valid names - // in which case 'functionName' will get the default node name returned - if (CheckFunction(functionName)) - { - NDLNode* ndlNode = new NDLNode("", functionName, this, ndlTypeFunction); - return ndlNode; - } - - // not found, return NULL - return NULL; - } - - // CallStringParse - parse the string description of a call sequence - // token - [in] string description of the call - // nameFunction - [out] name of the function being called - // params - [out] parameters to the function, set to empty string if no parameters - // returns: the node (if it exists) that matches this function name, otherwise NULL - NDLNode* CallStringParse(const std::string& token, std::string& nameFunction, std::string& params) - { - auto paramStart = token.find_first_of(OPENBRACES); - if (paramStart == npos) - RuntimeError("Invalid macro/function call can not be parsed: %s\n", token.c_str()); - nameFunction = token.substr(0, paramStart); - Trim(nameFunction); - params = token.substr(paramStart); - NDLNode* ndlNodeFound = CheckName(nameFunction); - return ndlNodeFound; - } - - - // ParseParameters - parse the parameters of a macro, or an array - // ndlNode - node we should add the parameters to - // value - parameters as config value - // createNew - create a new parameter node if one does not exist - void ParseParameters(NDLNode* ndlNode, const ConfigValue& value, bool createNew=false) - { - ConfigArray parameters = value; - for (auto iter = parameters.begin(); iter != parameters.end(); ++iter) - { - ConfigValue param = *iter; - NDLNode* paramNode = NULL; - auto foundBrace = param.find_first_of(FUNCTIONOPEN); - if (foundBrace != npos) // a nested call as a parameter - paramNode = ParseCall(param); - else // must be predefined variable or constant - { - paramNode = ParseVariable(param, createNew); - - // if we can't find the node right now, it's undetermined, must be defined later, or throw an error later - if (paramNode == nullptr) - { - paramNode = new NDLNode(param, param, this, ndlTypeUndetermined); - // add to the symbol table - AddSymbol(param, paramNode); - } - } - if (paramNode == NULL) - { - RuntimeError("variable name '%s' not found, must 
be previously defined\n", param.c_str()); - } - else - { - ndlNode->InsertParam(paramNode); - } - } - } - - // ParseVariable - parse a variable or constant - // token - string containing the variable or constant - // createNew - create a new variable node if no node found - // returns: the node that represents this newly defined variable - NDLNode* ParseVariable(const std::string& token, bool createNew=true) - { - NDLNode* ndlNode = NULL; - auto openBrace = token.find_first_of(OPENBRACES); - if (openBrace == 0) - { - ndlNode = new NDLNode("", token, this, ndlTypeArray); - ndlNode->SetParamString(token); - ParseParameters(ndlNode, token); - return ndlNode; - } - - auto found = token.find_first_not_of("+-.0123456789eE"); - // see if it's a numeric constant - if (found == npos) - { - ndlNode = new NDLNode("", token, this, ndlTypeConstant); - } - // not a constant, so must be a variable - else - { - // look for an optional parameter - auto foundEqual = token.find_first_of('='); - bool optional = (foundEqual != npos); - if (optional) - { - std::string name = token.substr(0, foundEqual); - Trim(name); - std::string value = token.substr(foundEqual+1); - Trim(value); - - ndlNode = new NDLNode(name, value, this, ndlTypeOptionalParameter); - } - else - { - ndlNode = CheckName(token); - if (createNew && ndlNode == NULL) - { - // NOTE: currently we only get here in Parameter scenarios, - // if other scenarios present themselves, need a good way to change the type - ndlNode = new NDLNode(token, token, this, ndlTypeParameter); - AddSymbol(token, ndlNode); - } - } - } - return ndlNode; - } - - // ParseDefinition - parse a macro definition - // token - string containing the macro definition (without the macro body) - // returns: the node that represents this newly defined macro - NDLNode* ParseDefinition(const std::string& token) - { - std::string nameFunction, params; - NDLNode* ndlNode = CallStringParse(token, nameFunction, params); - if (ndlNode) - RuntimeError("function '%s' already defined\n", nameFunction.c_str()); - ndlNode = new NDLNode(nameFunction, params, &s_global, ndlTypeMacro); - - // now set the variables/parameters which will be parsed when the body shows up - ndlNode->SetParamMacro(params); - - // now add this to the globals - s_global.AddSymbol(nameFunction,ndlNode); - - // NOTE: the body of the Macro will be parsed separately, this just sets up the node - return ndlNode; - } - - // ParseCall - parse the call syntax out into "function" and variables - // token - string containing the "call" - // return - Node pointer, the newly created node - NDLNode* ParseCall(const std::string& token) - { - std::string nameFunction, params; - NDLNode* ndlNode = CallStringParse(token, nameFunction, params); - - if (ndlNode == NULL) - RuntimeError("Undefined function or macro '%s' in %s\n", nameFunction.c_str(), token.c_str()); - - // now setup the variables/parameters - ConfigValue value = ConfigValue(params, nameFunction); - - ndlNode->SetParamString(value); - ParseParameters(ndlNode, value); - return ndlNode; - } - - // parse a 'key=value' pair and create the appropriate node for what was seen - // 'key=Function(x,y,z)' - function - // 'macro(x,y)={z=Input(x,y)} - // may also be Function(x,y,z), a nameless call (used in one-line macros) - std::string::size_type ParseValue(const std::string& stringParse, std::string::size_type tokenStart, std::string::size_type tokenEnd) - { - // first find previous character - - // skip leading spaces - tokenStart = stringParse.find_first_not_of(" \t", tokenStart); - 
auto keyEnd = stringParse.find_first_of(OPENBRACES"=", tokenStart); - bool equalFound = (keyEnd != npos && keyEnd < tokenEnd && stringParse[keyEnd] == '='); - - // this should be the body of the macro - if (m_macroNode) - { - bool oneLineDefinition = false; - NDLNode* macroNode = m_macroNode; - - // an '=' at the beginning, skip it - if (keyEnd == tokenStart && equalFound) - { - // skip the '=' sign - oneLineDefinition = true; - tokenStart = stringParse.find_first_not_of(" \t", tokenStart+1); - if (tokenStart == npos) - RuntimeError("Body of Macro missing"); - } - - NDLScript* script = new NDLScript(ConfigValue(stringParse.substr(tokenStart, tokenEnd-tokenStart), macroNode->GetName()), macroNode->GetName(), oneLineDefinition); - macroNode->SetScript(script); - - // reset so we know we are done with the body - m_macroNode = NULL; - - return tokenEnd; // done with the macro now - } - - // if we hit the end of the token before we hit an equal sign, it's a 'macro(x,y)' definition - // unless we are a one-line macro in which case we don't allow definitions - if (!m_noDefinitions && !equalFound) - { - keyEnd = stringParse.find_first_of(OPENBRACES, tokenStart); - if (keyEnd == npos || keyEnd >= tokenEnd) - RuntimeError("Invalid statement, does not contain an '=' sign: %s\n", stringParse.substr(tokenStart, tokenEnd-tokenStart).c_str()); - m_macroNode = ParseDefinition(stringParse.substr(tokenStart, tokenEnd-tokenStart)); - // the body of the macro will come through next time - return tokenEnd; - } - - // get the key value (symbol name) - std::string key; - - // no macro definitions allowed, so no equal means a function call - if (m_noDefinitions && !equalFound) - { - ;// nothing to do here, just skip the "key=" parsing below - } - else - { - key = stringParse.substr(tokenStart, keyEnd-tokenStart); - Trim(key); - - // check to make sure variable name isn't a valid function name as well - string strTemp = key; - if (CheckFunction(strTemp)) - RuntimeError("variable %s is invalid, it is reserved because it is also the name of a function", key.c_str()); - - tokenStart = keyEnd; - if (stringParse[keyEnd] == '=') - ++tokenStart; - - // skip any spaces before the second token - tokenStart = stringParse.find_first_not_of(" \t", tokenStart); - } - std::string::size_type substrSize = tokenEnd - tokenStart; - - auto bracesEnd = FindBraces(stringParse, tokenStart); - - // if braces found, we modify the token end according to braces - if (bracesEnd != npos) - { // include the trailing brace - tokenEnd = bracesEnd+1; - substrSize = tokenEnd - tokenStart; - - // for quote delimited string remove quotes - if (stringParse[tokenStart] == '"') - { - tokenStart++; - substrSize -= 2; // take out the quotes - } - } - - if (substrSize == 0) - return npos; - - // get the value - std::string value = stringParse.substr(tokenStart, substrSize); - Trim(value); - - NDLNode* ndlNode = NULL; - - // check for a function/macro call - auto found = value.find_first_of(FUNCTIONOPEN); - if (found != npos && found > 0) // brace found after some text, so a call - { - ndlNode = ParseCall(value); - // check if we have a user defined name, ParseCall assigns a default name - if (!key.empty()) - ndlNode->SetName(key); - AddSymbol(ndlNode->GetName(),ndlNode); - m_script.push_back(ndlNode); - } - // if it's not a call, must be a variable - else - { - ndlNode = ParseVariable(value); - bool newNode = ndlNode->GetName().empty(); - AddSymbol(key,ndlNode); - - ndlNode->SetName(key); - if (newNode) //only need to add nodes that are new (not renames) 
- { - m_script.push_back(ndlNode); - } - } - - return tokenEnd; - } - - // ExpandMacro - Expand a macro into a new macro definition - // node - NDLNode that holds the macro call - // returns: new node with the expanded macro - NDLNode* ExpandMacro(const NDLNode* node) - { - assert(node->GetType() == ndlTypeMacroCall); // needs to be a macro call (not definition) - - std::string name = node->GetName(); - // if we are calling a macro make a new copy of it and execute that instead (macro expansion) - // we do this so the evalValues in the macros will be valid regardless of number of instantiations - NDLNode* newNode = new NDLNode(name, node->GetValue(), this, ndlTypeMacroCall); - NDLScript* newScript = new NDLScript(*node->GetScript()); - newNode->SetScript(newScript); - newNode->SetParamMacro(node->GetParamMacro()); - - // now get the parameters to the macro added - ConfigValue paramString = node->GetParamString(); - ParseParameters(newNode, paramString, true /*createNew*/); - newNode->SetParamString(paramString); - - // fixup the symbol table to point to this one instead - AssignSymbol(name, newNode); - return newNode; - } - - // Evaluate - Evaluate the script - // nodeEval - the node evaluator to call - // baseName - baseName for all labels - // pass - what NDLPass are we on? - // skipThrough - skip through this node, will skip eval for all nodes up to and including this one - NDLNode* Evaluate(NDLNodeEvaluator& nodeEval, const wstring& baseName, const NDLPass pass=ndlPassInitial, NDLNode* skipThrough=nullptr) - { - NDLNode* nodeLast = skipThrough; - bool skip = skipThrough != nullptr; - std::wstring prevBaseName = GetBaseName(); - SetBaseName(baseName); - - for (auto& node : m_script) - { - // if we are in skip mode, and we found the skipThrough node, - // move out of skip mode and start processing at next node - if (skip) - { - if (node == skipThrough) - skip = false; - continue; - } - - // if it's a macro call, call the macro - if (node->GetType() == ndlTypeMacroCall) - { - node->EvaluateMacro(nodeEval, baseName, pass); - nodeEval.ProcessOptionalParameters(node); - } - else - { - nodeEval.Evaluate(node, baseName, pass); - } - nodeLast = node; - } - SetBaseName(prevBaseName); - return nodeLast; - } -}; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +#pragma once +#include "commandArgUtil.h" +#include "ComputationNode.h" +#include "TrainingCriterionNode.h" +#include "CompositeComputationNode.h" +#include "EvaluationCriterionNode.h" +#include "ComputationNetwork.h" +#include + +namespace Microsoft { namespace MSR { namespace CNTK { + +// EqualInsensitive - check to see if two nodes are equal up to the length of the first string (must be at least half as long as actual node name) +// string1 - [in,out] string to compare, if comparision is equal insensitive but not sensitive, will replace with sensitive version +// string2 - second string to compare +// alternate - alternate naming of the string +// return - true if strings are equal insensitive and modifies string1 to sensitive version if different +bool EqualInsensitive(std::wstring& string1, const std::wstring& string2, const wchar_t* alternate=NULL); + +// CheckFunction - check to see if we match a function name +// string1 - [in,out] string to compare, if comparision is equal and at least half the full node name will replace with full node name +// allowUndeterminedVariable - [out] set to true if undetermined variables (symbols yet to be defined) are allowed here +// return - true if function name found +template +bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable=nullptr); + +// NDLType - Network Description Language node type +enum NDLType +{ + ndlTypeNull, + ndlTypeConstant, + ndlTypeFunction, + ndlTypeVariable, + ndlTypeParameter, // parameter value, must be looked up to get actual value + ndlTypeUndetermined, // an undetermined value that will later be resolved + ndlTypeOptionalParameter, + ndlTypeArray, + ndlTypeMacroCall, // calling a macro + ndlTypeMacro, // definition of a macro + ndlTypeMax +}; + +// NDLPass - enumeration for the number of passes through the NDL parser +enum NDLPass +{ + ndlPassInitial, // inital pass, create nodes + ndlPassResolve, // resolve any undetermined symbols (variables that were not yet declared in NDL) + ndlPassFinal, // final pass done post-validation (when all matrices are allocated to the correct size) + ndlPassAll = ndlPassFinal, // all passes, used as flag in NDLUtil.h + ndlPassMax // number of NDLPasses +}; + +// ++ operator for this enum, so loops work +NDLPass &operator++(NDLPass &ndlPass); + +// Predeclaration of Script and Node +template +class NDLScript; + +template +class NDLNode; + +// NDLNodeEvaluator - Node evaluaton interface +// implemented by execution engines to convert script to approriate internal formats +template +class NDLNodeEvaluator +{ +public: + virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) = 0; + virtual ~NDLNodeEvaluator() = 0; + + // EvaluateParameter - Evaluate a parameter of a call + // node - NDLNode of the script + // nodeParam - NDLNode parameter we are evaluating + // baseName - name of the base node + // pass - which pass through the NDL nodes + // returns: the node that is the evaluated parameter + virtual NDLNode* EvaluateParameter(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseName, const NDLPass pass ) = 0; + + // EvaluateParameters - Evaluate the parameters of a call + // node - NDLNode we are evaluating paramters for + // baseName - baseName for the current node + // nodeParamStart - starting parameter that contains a node + // nodeParamCount - ending parameter that contains a node + // pass - NDL pass we are evaluating + // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator + virtual 
std::vector EvaluateParameters(NDLNode* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass) = 0; + + // FindSymbol - Search the engines symbol table for a fully quantified symbol + // symbol - name of the symbol + // returns - pointer to the matching EvalValue for that node, of NULL if not found + virtual void* FindSymbol(const wstring& /*symbol*/) + { + return NULL; + } + // ProcessOptionalParameters - Process the optional parameters of a node + // node to process + virtual void ProcessOptionalParameters(NDLNode* /*node*/) + { + return; + } + +}; + +template class NDLNodeEvaluator; +template class NDLNodeEvaluator; + +template +class NetNdl // class to associate a network with an NDLScript +{ +public: + ComputationNetwork* cn; + NDLScript* ndl; // NDLScript we are using for this network. NOTE: the actual script used + NDLNode* lastNode[ndlPassMax]; // last node we evaluated for each pass + NetNdl(): cn(nullptr), ndl(nullptr) {ClearLastNodes();} + NetNdl(ComputationNetwork*p_cn): cn(p_cn), ndl(nullptr) {ClearLastNodes();} + NetNdl(ComputationNetwork*p_cn, NDLScript* p_ndl): cn(p_cn), ndl(p_ndl) {ClearLastNodes();} + ~NetNdl() + {} + + // ClearLastNodes - Clear out the last node values for all passes + void ClearLastNodes() + { + for (NDLPass pass=ndlPassInitial;pass < ndlPassMax;++pass) + { + lastNode[pass] = nullptr; + } + } + + // Clear - clear out everything in the structure + // NOTE: this deletes the network and the NDLScript, use with care! + void Clear() + { + delete cn; + delete ndl; + cn = nullptr; + ndl = nullptr; + ClearLastNodes(); + } +}; + +template +inline NDLNodeEvaluator::~NDLNodeEvaluator() { } // defined even though it's virtual; supposed to be faster this way + +// NDLNode - Network Description Language Node +// Used to represent a named entity in the NDL +// if a name is not provided (such as in nesting scenarios) one will be generated +template +class NDLNode +{ +private: + std::string m_name; // value on the left of the equals + ConfigValue m_value; // value on the right of the equals (CN node name, or value) + NDLScript* m_parent; // parent script + NDLType m_type; //type of node + ConfigArray m_paramString; // parameter of a function/array + ConfigArray m_paramMacro; // parameter of a macro (the variables used in the macro definition) + vector m_parameters; // parameters as nodes/array elements + void *m_eval; // pointer to an arbitrary eval structure + NDLScript* m_script; // script for ndlTypeMacro + static int s_nameCounter; // counter for generating unique names +public: + NDLNode(const std::string& name, ConfigValue value, NDLScript* parent, NDLType ndlType) + { + if (name.empty()) + GenerateName(); + else + m_name = name; + m_value = value; + m_parent = parent; + assert(parent != NULL); + parent->AddChild(this); + m_type = ndlType; + m_eval = NULL; + m_script = NULL; + } + + ~NDLNode() + {} + + // publicly accessible Copy method + // should only be used for macro expansion + NDLNode* Copy() const + { + NDLNode* ret = new NDLNode(*this); + return ret; + } + +private: + + // copy constructor, creates a new disconnected copy of this node for macro expansion + NDLNode(const NDLNode& copyMe); + + NDLNode& operator=(NDLNode& /*copyMe*/) //this is just a place holder implementation which is not functioning but prevent callers to use it. 
+    {
+        throw std::logic_error("'NDLNode& operator=(NDLNode& copyMe)' should never be called.");
+    }
+
+    // generate a generic symbol name for a node
+    void GenerateName()
+    {
+        char buffer[16]; // large enough for any int value of s_nameCounter
+        sprintf(buffer, "%d", ++s_nameCounter);
+        m_name = std::string("unnamed") + buffer;
+    }
+
+public:
+    void SetScript(NDLScript<ElemType>* script) {m_script = script;}
+    NDLScript<ElemType>* GetScript() const {return m_script;}
+    void SetType(NDLType type) {m_type = type;}
+    NDLType GetType() const {return m_type;}
+    const std::string& GetName() const {return m_name;}
+    void SetName(std::string& name) {m_name = name;}
+    ConfigValue GetValue() const {return m_value;}
+    void SetValue(std::string& value) {m_value = value;}
+
+    // parameters of a function (ndlTypeFunction), or parameters in the call to a macro
+    void SetParamString(ConfigValue paramString) {m_paramString = paramString;}
+    ConfigArray GetParamString() const {return m_paramString;}
+
+    // parameters of a macro
+    void SetParamMacro(ConfigValue paramMacro) {m_paramMacro = paramMacro;}
+    ConfigArray GetParamMacro() const {return m_paramMacro;}
+
+    void SetParentScript(NDLScript<ElemType>* script) {m_parent = script;}
+    NDLScript<ElemType>* GetParentScript() { return m_parent; }
+
+    // get parameters, either just optional or just regular
+    vector<NDLNode*> GetParameters(bool optional=false) const
+    {
+        vector<NDLNode*> result;
+        for (NDLNode* param : m_parameters)
+        {
+            bool optParam = param->GetType() == ndlTypeOptionalParameter;
+            if (optParam == optional)
+                result.push_back(param);
+        }
+        return result;
+    }
+
+    // Get/Set eval values
+    void* GetEvalValue() const { return m_eval;}
+    void SetEvalValue(void* evalValue) {m_eval = evalValue;}
+
+    // GetOptionalParameter - Get an optional parameter value
+    // name - the name to search for in the optional parameters
+    // deflt - the default value (if not found)
+    // returns: parameter value if found, or default value otherwise
+    ConfigValue GetOptionalParameter(const std::string& name, const std::string& deflt) const
+    {
+        for (NDLNode* param : m_parameters)
+        {
+            bool optParam = param->GetType() == ndlTypeOptionalParameter;
+            if (optParam && !_stricmp(param->GetName().c_str(), name.c_str()))
+            {
+                // the value may itself name a variable; if it resolves, return the scalar it refers to
+                auto paramValue = param->GetValue();
+                auto resolveParamNode = m_parent->ParseVariable(paramValue, false);
+                if (resolveParamNode != nullptr)
+                    return resolveParamNode->GetScalar();
+                else
+                    return paramValue;
+            }
+        }
+        return ConfigValue(deflt);
+    }
+
+    // FindNode - Find a node of the given name
+    // name - name to search for
+    // searchForDotNames - search for NDL symbols traversing the call hierarchy
+    // returns: the node with that name, or NULL if not found
+    NDLNode* FindNode(const std::string& name, bool searchForDotNames=false)
+    {
+        NDLNode* found = m_parent->FindSymbol(name, searchForDotNames);
+        if (!found)
+            found = NDLScript<ElemType>::GlobalScript().FindSymbol(name, searchForDotNames);
+        return found;
+    }
+
+    // GetScalar - Get a scalar value from a node, possibly following a chain of variables to reach a constant
+    // returns: scalar value
+    ConfigValue GetScalar()
+    {
+        NDLNode* node = this;
+        while (node && (node->GetType() == ndlTypeVariable || node->GetType() == ndlTypeParameter))
+        {
+            NDLNode* nodeLast = node;
+            node = node->FindNode(node->GetValue(), true /*searchForDotNames*/);
+
+            // if we are still on the same node, it was never resolved to anything: an undefined variable
+            if (nodeLast == node)
+            {
+                RuntimeError("undefined variable '%s'; must be declared before first use\n", node->GetName().c_str());
+            }
+        }
+        if (!node || node->GetType() !=
ndlTypeConstant) + { + std::string name = node ? node->GetName() : GetName(); + RuntimeError("Scalar expected, '%s' must be a constant or variable that resolves to a constant\n", name.c_str()); + } + return node->GetValue(); + } + + void InsertParam(NDLNode* param) {m_parameters.push_back(param);} + + // EvaluateMacro - Evaluate a macro, make the call + // nodeEval - the node evaluator we are using to interpret the script + // baseName - base name for all symbols at this level + // pass - what NDLPass are we in? + // returns: the return node for this macro + NDLNode* EvaluateMacro(NDLNodeEvaluator& nodeEval, const wstring& baseName, const NDLPass pass) + { + if (m_type != ndlTypeMacroCall) + return NULL; + + // make sure the actual parameters and expected parameters match + if (m_parameters.size() < m_paramMacro.size()) + { + RuntimeError("Parameter mismatch, %d parameters provided, %d expected in call to %s\n", + m_parameters.size(),m_paramMacro.size(),m_value.c_str()); + } + + // assign the actual parameters in the script so we can execute it + for (int i=0; i < m_parameters.size(); ++i) + { + NDLNode* nodeParam = m_parameters[i]; + std::string paramName = i < m_paramMacro.size()?m_paramMacro[i]:nodeParam->GetName(); + + // if the node is a parameter then look it up in the symbol table + if (nodeParam->GetType() == ndlTypeParameter) + { + nodeParam = m_parent->FindSymbol(nodeParam->GetName()); + } + // do we want to add optional parameters as symbols, or not? + else if (nodeParam->GetType() == ndlTypeOptionalParameter) + { + if (i < m_paramMacro.size()) + RuntimeError("Parameter mismatch, parameter %d is an optional parameter, but should be a required parameter\n",i); + // if no symbol yet, add it + if (!m_script->ExistsSymbol(paramName)) + { + m_script->AddSymbol(paramName, nodeParam); + continue; + } + //else assign the value below + } + + // assign the parameter symbols in the script we will call with the values passed to the call + m_script->AssignSymbol(paramName, nodeParam); + + } + + std::wstring newBase = baseName; + if (!newBase.empty()) + newBase += L"."; + newBase += msra::strfun::utf16(m_name); + + // now evaluate the contained macro script + NDLNode* nodeResult = m_script->Evaluate(nodeEval, newBase, pass); + // Consider: do we need to restore the original mapping here, may need to for recursive calls? + + // look for a symbol that is identical to the macro name, if it exists this is the return value + NDLNode* nodeMacroName = m_script->FindSymbol(m_value); + if (nodeMacroName) + { + nodeResult = nodeMacroName; + } + + // set the eval node to be the same as the return value; + if (nodeResult) + { + m_eval = nodeResult->GetEvalValue(); + } + return nodeResult; + } +}; + +template +class NDLScript: public ConfigParser +{ +private: + std::wstring m_baseName; + std::string m_scriptString; + std::vector*> m_script; // script lines in parsed node order, macros will have definition followed by body + std::map*, nocase_compare> m_symbols; // symbol table + NDLNode* m_macroNode; // set when interpretting a macro definition + bool m_noDefinitions; // no definitions can be made in this script, interpret all macro/function names as calls + static NDLScript s_global; //("global"); // global script for storing macros and global nodes + std::vector*> m_children; // child nodes. Note that m_script nodes may not be children of this object, they include macro nodes + ComputationNetwork* m_cn; // computation network to use for backup symbol lookup. 
Used for MEL where NDL and network nodes are mixed + bool m_definingMacro; // currently defining a macro, flag to determine if we are defining or interpretting a macro call + +public: + // constructors that take a config name + NDLScript(const std::string & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } + NDLScript(const std::wstring & configname) : ConfigParser(';', configname) { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } + ~NDLScript() + { + // need to free all the child nodes attached to this script node + for (NDLNode* node : m_children) + { + delete node; + } + m_children.clear(); + } + + // empty constructor + NDLScript() : ConfigParser(';') { m_macroNode = NULL; m_noDefinitions = false; m_definingMacro = false; } // parameterless version if needed + + // construct NDLScript from a ConfigValue, propogate the config Name + NDLScript(const ConfigValue& configValue) : ConfigParser(';',configValue.Name()) + { + m_macroNode = NULL; + m_noDefinitions=false; + m_definingMacro = false; + m_scriptString = configValue; + Parse(m_scriptString); + } + + // construct NDLScript from a ConfigValue, propogate the config Name + // configValue - the body of the macro + // oneLineDefinition - this macro definition is all on one line, names optional + // macroName - if the macro has a name, the name - this is used to get parameter info + NDLScript(const ConfigValue& configValue, std::string macroName, bool oneLineDefinition) : ConfigParser(';',configValue.Name()) + { + m_noDefinitions = oneLineDefinition; + m_definingMacro = true; + m_macroNode = NULL; + m_scriptString = configValue; + NDLNode* ndlNode = s_global.CheckName(macroName, true); + if (ndlNode == NULL) + RuntimeError("Invalid macro definition, %s not found", macroName.c_str()); + + // get and parse the parameters + ConfigArray parameters = ndlNode->GetParamMacro(); + for (auto iter = parameters.begin(); iter != parameters.end(); ++iter) + { + // we are adding parameters that will be replaced by actual values later + ConfigValue param = *iter; + + // check to make sure this parameter name is not a reserved word + std::string functionName = param; + // check for function name, a function may have two valid names + // in which case 'functionName' will get the default node name returned + if (CheckFunction(functionName)) + { + RuntimeError("NDLScript: Macro %s includes a parameter %s, which is also the name of a function. Parameter names may not be the same as function names.", macroName.c_str(), param.c_str()); + } + + NDLNode* paramNode = new NDLNode(param, param, this, ndlTypeParameter); + // add to node parameters + ndlNode->InsertParam(paramNode); + // add to script symbol table + AddSymbol(param, paramNode); + } + Parse(m_scriptString); + m_definingMacro = false; + } + + + // copy and move constructors + NDLScript(const NDLScript& copyMe); + NDLScript(const NDLScript&& moveMe); +private: + NDLNode* DuplicateNode(NDLNode* node); +public: + // GlobalScript - Access to global script + static NDLScript& GlobalScript() {return s_global;} + + // SetMacroDefinitionsAllowed - allow macro definitions + // macroAllowed - can macros be defined in this script? 
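+ // (note: the flag is stored inverted in m_noDefinitions; when that member is true, every macro/function name is treated as a call)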
+ void SetMacroDefinitionsAllowed(bool macroAllowed) + { + m_noDefinitions = !macroAllowed; + } + + void SetBaseName(const std::wstring& baseName) + { + m_baseName = baseName; + } + const std::wstring& GetBaseName() + { + return m_baseName; + } + + void ClearGlobal() + { + s_global.Clear(); + } + + void Clear() + { + + for (NDLNode* node : m_children) + { + delete node; + } + m_children.clear(); + for (NDLNode* node : m_script) + { + delete node; + } + m_script.clear(); + + m_symbols.clear(); + } + void ClearEvalValues() + { + for (NDLNode* node : m_children) + { + node->SetEvalValue(NULL); + } + } + // AddChild - add a child node to the script + // node - node to add + // NOTE: this NDLScript owns this node and is responsible to delete it + void AddChild(NDLNode* node) + { + m_children.push_back(node); + } + + // SetComputationNetwork - set the computation network this NDL is associated with + void SetComputationNetwork(ComputationNetwork* cn) + { + m_cn = cn; + } + + // FindSymbol - Find a symbol to the symbol table + // symbol - symbol to find + // searchForDotNames - search for NDL symbols traversing call heirarchy + // returns - node this symbol references + NDLNode* FindSymbol(const std::string& symbol, bool searchForDotNames=true) + { + auto found = m_symbols.find(symbol); //search symbol directly first + if (found != m_symbols.end()) + return found->second; + + // if not found, handle dot names by move up the hierarchy + size_t firstDot = symbol.find_first_of('.'); + if (firstDot == npos) + return nullptr; + + std::string search = symbol.substr(0,firstDot); + found = m_symbols.find(search); + if (found == m_symbols.end()) + { + return NULL; + } + + // handle dot names, + if (firstDot != npos) + { + NDLNode* node = found->second; + NDLScript* script = node->GetScript(); + // if there is no script, probably a parameter/variable with further 'dot' values (ie. var.CE.BFF) + if (script != NULL) + { + if (node->GetType() != ndlTypeMacroCall || script == NULL) + RuntimeError("Symbol name not valid, %s is not a macro, so %s cannot be interpretted",search.c_str(),symbol.c_str() ); + return script->FindSymbol(symbol.substr(firstDot+1), searchForDotNames); + } + } + return found->second; + } + + // ExistsSymbol - Find if a symbol exists (value might be NULL) + // symbol - symbol to find + // returns - true if it's there + bool ExistsSymbol(const std::string& symbol) + { + auto found = m_symbols.find(symbol); + return (found != m_symbols.end()); + } + + // ContainsOptionalParameter - do any nodes in this script have an optional parameter by the following name? 
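+ // (optional parameters are the name=value entries attached to a node, for example tag=feature)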
+ // optParamName - name of parameter we are searching for + // returns: vector of the nodes found (empty if nothing found) + vector*> ContainsOptionalParameter(const std::string& optParamName) + { + vector*> result; + std::string empty; + for (auto symbol : m_symbols) + { + NDLNode* node = symbol.second; + std::string value = node->GetOptionalParameter(optParamName, empty); + if (!value.empty()) + { + result.push_back(node); + } + } + return result; + } + + // AddSymbol - Add a symbol to the symbol table + // symbol - symbol to add + // node - node this symbol references + // NOTE: at present we don't allow reuse of a symbol, so this throws an error if it sees an existing symbol + void AddSymbol(const std::string& symbol, NDLNode* node) + { + auto found = m_symbols.find(symbol); + if (found != m_symbols.end()) + { + NDLNode* nodeFound = found->second; + // check for undetermined nodes, because these nodes are to be defined later + if (nodeFound->GetType() != ndlTypeUndetermined && nodeFound->GetType() != ndlTypeParameter) + { + std::string value = found->second->GetValue(); + RuntimeError("Symbol '%s' currently assigned to '%s' reassigning to a different value not allowed\n", symbol.c_str(), value.c_str()); + } + } + m_symbols[symbol] = node; + } + + // AssignSymbol - Assign a new value to a symbol in the table + // symbol - symbol to assign + // node - node this symbol will reference + void AssignSymbol(const std::string& symbol, NDLNode* node) + { + auto found = m_symbols.find(symbol); + if (found == m_symbols.end()) + { + RuntimeError("Symbol '%s' currently does not exist, attempting to assigned value '%s' AssignSymbol() requires existing symbol\n", symbol.c_str(), node->GetValue().c_str()); + } + m_symbols[symbol] = node; + } + + + // FileParse - parse at the file level, can be overridden for "section of file" behavior + // stringParse - file concatentated as a single string + void FileParse(const std::string& stringParse) + { + ConfigParameters sections(stringParse); + bool loadOrRunFound = false; + + // load all the sections that we want (macros) + if (sections.Exists("load")) + { + auto config = ConfigArray(sections("load")); + for (int i=0; i < config.size(); ++i) + { + Parse(sections(config[i])); + } + loadOrRunFound = true; + } + + // load and then execute + if (sections.Exists("run")) + { + auto config = ConfigArray(sections("run")); + for (int i=0; i < config.size(); ++i) + { + Parse(sections(config[i])); + } + loadOrRunFound = true; + } + + // didn't find any of the tags, so just parse the whole thing as a script + if (!loadOrRunFound) + { + // surround text in braces so we parse correctly + std::string textInBraces = "[ "+stringParse+" ]"; + Parse(textInBraces); + } + } + + // IsMacroDefinition - is this a macro definition? 
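+ // (m_definingMacro is set while the macro-body constructor above is parsing and cleared when it finishes)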
+    // IsMacroDefinition - is this a macro definition?
+    // returns - true if a definition, otherwise false
+    bool IsMacroDefinition()
+    {
+        return m_definingMacro;
+    }
+
+    // CheckName - check for a name in our symbols, see if it exists
+    // name - name we are looking for
+    // localOnly - only look in the current scope, not the global scope
+    // if it does exist, return the node that represents the name
+    NDLNode<ElemType>* CheckName(const std::string& name, bool localOnly=false)
+    {
+        // first try the local script
+        auto found = FindSymbol(name);
+        if (found != NULL)
+        {
+            return found;
+        }
+
+        // next try the globals; this includes macros and global constants
+        if (!localOnly)
+        {
+            auto found = s_global.FindSymbol(name);
+            if (found != NULL)
+            {
+                NDLNode<ElemType>* node = found;
+                if (node->GetType() == ndlTypeMacro)
+                {
+                    // if we are calling a macro we need to keep track of formal parameters,
+                    // keep them as strings in this macroCall node
+                    NDLNode<ElemType>* newNode = new NDLNode<ElemType>("", name, this, ndlTypeMacroCall);
+                    NDLScript<ElemType>* script = node->GetScript();
+
+                    // if this is a macro call (and not a definition), we want to expand the macro (make a copy)
+                    if (!IsMacroDefinition())
+                    {
+                        script = new NDLScript<ElemType>(*script);
+                    }
+                    newNode->SetScript(script);
+
+                    newNode->SetParamMacro(node->GetParamMacro());
+                    node = newNode;
+                }
+                return node;
+            }
+        }
+
+        std::string functionName = name;
+        // check for a function name; a function may have two valid names,
+        // in which case 'functionName' will get the default node name returned
+        if (CheckFunction(functionName))
+        {
+            NDLNode<ElemType>* ndlNode = new NDLNode<ElemType>("", functionName, this, ndlTypeFunction);
+            return ndlNode;
+        }
+
+        // not found, return NULL
+        return NULL;
+    }
+
+    // CallStringParse - parse the string description of a call sequence
+    // token - [in] string description of the call
+    // nameFunction - [out] name of the function being called
+    // params - [out] parameters to the function, set to the empty string if there are no parameters
+    // returns: the node (if it exists) that matches this function name, otherwise NULL
+    NDLNode<ElemType>* CallStringParse(const std::string& token, std::string& nameFunction, std::string& params)
+    {
+        auto paramStart = token.find_first_of(OPENBRACES);
+        if (paramStart == npos)
+            RuntimeError("Invalid macro/function call, cannot be parsed: %s\n", token.c_str());
+        nameFunction = token.substr(0, paramStart);
+        Trim(nameFunction);
+        params = token.substr(paramStart);
+        NDLNode<ElemType>* ndlNodeFound = CheckName(nameFunction);
+        return ndlNodeFound;
+    }
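+// Illustrative sketch (editorial): CallStringParse splits a call string at
+// the first open brace; the text before it becomes the function/macro name
+// and the rest the raw parameter string. "Times" is a built-in function name
+// here; the surrounding script object is assumed.
+#if 0
+std::string name, params;
+NDLNode<float>* node = script.CallStringParse("Times(W, x)", name, params);
+// name == "Times", params == "(W, x)"; node comes from CheckName(name) and is
+// NULL when the name is neither a symbol, a macro, nor a function.
+#endif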
+    // ParseParameters - parse the parameters of a macro, or an array
+    // ndlNode - node we should add the parameters to
+    // value - parameters as a config value
+    // createNew - create a new parameter node if one does not exist
+    void ParseParameters(NDLNode<ElemType>* ndlNode, const ConfigValue& value, bool createNew=false)
+    {
+        ConfigArray parameters = value;
+        for (auto iter = parameters.begin(); iter != parameters.end(); ++iter)
+        {
+            ConfigValue param = *iter;
+            NDLNode<ElemType>* paramNode = NULL;
+            auto foundBrace = param.find_first_of(FUNCTIONOPEN);
+            if (foundBrace != npos) // a nested call as a parameter
+                paramNode = ParseCall(param);
+            else // must be a predefined variable or constant
+            {
+                paramNode = ParseVariable(param, createNew);
+
+                // if we can't find the node right now, it's undetermined: it must be defined later, or an error will be thrown later
+                if (paramNode == nullptr)
+                {
+                    paramNode = new NDLNode<ElemType>(param, param, this, ndlTypeUndetermined);
+                    // add to the symbol table
+                    AddSymbol(param, paramNode);
+                }
+            }
+            if (paramNode == NULL)
+            {
+                RuntimeError("variable name '%s' not found, must be previously defined\n", param.c_str());
+            }
+            else
+            {
+                ndlNode->InsertParam(paramNode);
+            }
+        }
+    }
+
+    // ParseVariable - parse a variable or constant
+    // token - string containing the variable or constant
+    // createNew - create a new variable node if no node is found
+    // returns: the node that represents this newly defined variable
+    NDLNode<ElemType>* ParseVariable(const std::string& token, bool createNew=true)
+    {
+        NDLNode<ElemType>* ndlNode = NULL;
+        auto openBrace = token.find_first_of(OPENBRACES);
+        if (openBrace == 0)
+        {
+            ndlNode = new NDLNode<ElemType>("", token, this, ndlTypeArray);
+            ndlNode->SetParamString(token);
+            ParseParameters(ndlNode, token);
+            return ndlNode;
+        }
+
+        auto found = token.find_first_not_of("+-.0123456789eE");
+        // see if it's a numeric constant
+        if (found == npos)
+        {
+            ndlNode = new NDLNode<ElemType>("", token, this, ndlTypeConstant);
+        }
+        // not a constant, so it must be a variable
+        else
+        {
+            // look for an optional parameter
+            auto foundEqual = token.find_first_of('=');
+            bool optional = (foundEqual != npos);
+            if (optional)
+            {
+                std::string name = token.substr(0, foundEqual);
+                Trim(name);
+                std::string value = token.substr(foundEqual+1);
+                Trim(value);
+
+                ndlNode = new NDLNode<ElemType>(name, value, this, ndlTypeOptionalParameter);
+            }
+            else
+            {
+                ndlNode = CheckName(token);
+                if (createNew && ndlNode == NULL)
+                {
+                    // NOTE: currently we only get here in Parameter scenarios;
+                    // if other scenarios present themselves, we need a good way to change the type
+                    ndlNode = new NDLNode<ElemType>(token, token, this, ndlTypeParameter);
+                    AddSymbol(token, ndlNode);
+                }
+            }
+        }
+        return ndlNode;
+    }
+
+    // ParseDefinition - parse a macro definition
+    // token - string containing the macro definition (without the macro body)
+    // returns: the node that represents this newly defined macro
+    NDLNode<ElemType>* ParseDefinition(const std::string& token)
+    {
+        std::string nameFunction, params;
+        NDLNode<ElemType>* ndlNode = CallStringParse(token, nameFunction, params);
+        if (ndlNode)
+            RuntimeError("function '%s' already defined\n", nameFunction.c_str());
+        ndlNode = new NDLNode<ElemType>(nameFunction, params, &s_global, ndlTypeMacro);
+
+        // now set the variables/parameters, which will be parsed when the body shows up
+        ndlNode->SetParamMacro(params);
+
+        // now add this to the globals
+        s_global.AddSymbol(nameFunction, ndlNode);
+
+        // NOTE: the body of the macro will be parsed separately; this just sets up the node
+        return ndlNode;
+    }
+
+    // ParseCall - parse the call syntax out into "function" and variables
+    // token - string containing the "call"
+    // return - node pointer, the newly created node
+    NDLNode<ElemType>* ParseCall(const std::string& token)
+    {
+        std::string nameFunction, params;
+        NDLNode<ElemType>* ndlNode = CallStringParse(token, nameFunction, params);
+
+        if (ndlNode == NULL)
+            RuntimeError("Undefined function or macro '%s' in %s\n", nameFunction.c_str(), token.c_str());
+
+        // now set up the variables/parameters
+        ConfigValue value = ConfigValue(params, nameFunction);
+
+        ndlNode->SetParamString(value);
+        ParseParameters(ndlNode, value);
+        return ndlNode;
+    }
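+// Illustrative sketch (editorial): a parameter token containing '=' becomes
+// an ndlTypeOptionalParameter node rather than a positional parameter, which
+// is what ContainsOptionalParameter() searches for later. The token
+// "init=uniform" is hypothetical.
+#if 0
+NDLNode<float>* opt = script.ParseVariable("init=uniform", /*createNew=*/false);
+// opt->GetType() == ndlTypeOptionalParameter, name "init", value "uniform"
+#endif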
+    // ParseValue - parse a 'key=value' pair and create the appropriate node for what was seen
+    // 'key=Function(x,y,z)' - function call
+    // 'macro(x,y)={z=Input(x,y)}' - macro definition
+    // may also be Function(x,y,z), a nameless call (used in one-line macros)
+    std::string::size_type ParseValue(const std::string& stringParse, std::string::size_type tokenStart, std::string::size_type tokenEnd)
+    {
+        // skip leading spaces
+        tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
+
+        auto keyEnd = stringParse.find_first_of(OPENBRACES "=", tokenStart);
+        bool equalFound = (keyEnd != npos && keyEnd < tokenEnd && stringParse[keyEnd] == '=');
+
+        // this should be the body of the macro
+        if (m_macroNode)
+        {
+            bool oneLineDefinition = false;
+            NDLNode<ElemType>* macroNode = m_macroNode;
+
+            // an '=' at the beginning, skip it
+            if (keyEnd == tokenStart && equalFound)
+            {
+                // skip the '=' sign
+                oneLineDefinition = true;
+                tokenStart = stringParse.find_first_not_of(" \t", tokenStart+1);
+                if (tokenStart == npos)
+                    RuntimeError("Body of macro missing");
+            }
+
+            NDLScript<ElemType>* script = new NDLScript<ElemType>(ConfigValue(stringParse.substr(tokenStart, tokenEnd-tokenStart), macroNode->GetName()), macroNode->GetName(), oneLineDefinition);
+            macroNode->SetScript(script);
+
+            // reset so we know we are done with the body
+            m_macroNode = NULL;
+
+            return tokenEnd; // done with the macro now
+        }
+
+        // if we hit the end of the token before we hit an equal sign, it's a 'macro(x,y)' definition,
+        // unless we are a one-line macro, in which case we don't allow definitions
+        if (!m_noDefinitions && !equalFound)
+        {
+            keyEnd = stringParse.find_first_of(OPENBRACES, tokenStart);
+            if (keyEnd == npos || keyEnd >= tokenEnd)
+                RuntimeError("Invalid statement, does not contain an '=' sign: %s\n", stringParse.substr(tokenStart, tokenEnd-tokenStart).c_str());
+            m_macroNode = ParseDefinition(stringParse.substr(tokenStart, tokenEnd-tokenStart));
+            // the body of the macro will come through next time
+            return tokenEnd;
+        }
+
+        // get the key value (symbol name)
+        std::string key;
+
+        // no macro definitions allowed, so no equal sign means a function call
+        if (m_noDefinitions && !equalFound)
+        {
+            ; // nothing to do here, just skip the "key=" parsing below
+        }
+        else
+        {
+            key = stringParse.substr(tokenStart, keyEnd-tokenStart);
+            Trim(key);
+
+            // check to make sure the variable name isn't also a valid function name
+            string strTemp = key;
+            if (CheckFunction(strTemp))
+                RuntimeError("variable %s is invalid, it is reserved because it is also the name of a function", key.c_str());
+
+            tokenStart = keyEnd;
+            if (stringParse[keyEnd] == '=')
+                ++tokenStart;
+
+            // skip any spaces before the second token
+            tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
+        }
+        std::string::size_type substrSize = tokenEnd - tokenStart;
+
+        auto bracesEnd = FindBraces(stringParse, tokenStart);
+
+        // if braces were found, we modify the token end according to the braces
+        if (bracesEnd != npos)
+        { // include the trailing brace
+            tokenEnd = bracesEnd+1;
+            substrSize = tokenEnd - tokenStart;
+
+            // for a quote-delimited string, remove the quotes
+            if (stringParse[tokenStart] == '"')
+            {
+                tokenStart++;
+                substrSize -= 2; // take out the quotes
+            }
+        }
+
+        if (substrSize == 0)
+            return npos;
+
+        // get the value
+        std::string value = stringParse.substr(tokenStart, substrSize);
+        Trim(value);
+
+        NDLNode<ElemType>* ndlNode = NULL;
+
+        // check for a function/macro call
+        auto found = value.find_first_of(FUNCTIONOPEN);
+        if (found != npos && found > 0) // brace found after some text, so a call
+        {
+            ndlNode = ParseCall(value);
+            // check if we have a user-defined name; ParseCall assigns a default name
+            if (!key.empty())
+                ndlNode->SetName(key);
+            AddSymbol(ndlNode->GetName(), ndlNode);
+            m_script.push_back(ndlNode);
+        }
+        // if it's not a call, it must be a variable
+        else
+        {
+            ndlNode = ParseVariable(value);
+            bool newNode = ndlNode->GetName().empty();
+            AddSymbol(key, ndlNode);
+
+            ndlNode->SetName(key);
+            if (newNode) // only need to add nodes that are new (not renames)
+            {
+                m_script.push_back(ndlNode);
+            }
+        }
+
+        return tokenEnd;
+    }
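+// Illustrative sketch (editorial): how ParseValue dispatches on a few
+// hypothetical statements. A definition header arms m_macroNode, so the very
+// next token is consumed as that macro's body.
+//
+//     FF(x,W)             // no '=' before the brace: ParseDefinition, arms m_macroNode
+//     {z=Times(W,x)}      // consumed as the body of FF via a nested NDLScript
+//     L1=FF(features,W0)  // key "L1", value is a call: ParseCall + AddSymbol
+//     HDim=1024           // key "HDim", value is a constant: ParseVariable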
+    // ExpandMacro - expand a macro into a new macro definition
+    // node - NDLNode that holds the macro call
+    // returns: new node with the expanded macro
+    NDLNode<ElemType>* ExpandMacro(const NDLNode<ElemType>* node)
+    {
+        assert(node->GetType() == ndlTypeMacroCall); // needs to be a macro call (not a definition)
+
+        std::string name = node->GetName();
+        // if we are calling a macro, make a new copy of it and execute that instead (macro expansion)
+        // we do this so the evalValues in the macros will be valid regardless of the number of instantiations
+        NDLNode<ElemType>* newNode = new NDLNode<ElemType>(name, node->GetValue(), this, ndlTypeMacroCall);
+        NDLScript<ElemType>* newScript = new NDLScript<ElemType>(*node->GetScript());
+        newNode->SetScript(newScript);
+        newNode->SetParamMacro(node->GetParamMacro());
+
+        // now get the parameters to the macro added
+        ConfigValue paramString = node->GetParamString();
+        ParseParameters(newNode, paramString, true /*createNew*/);
+        newNode->SetParamString(paramString);
+
+        // fix up the symbol table to point to this one instead
+        AssignSymbol(name, newNode);
+        return newNode;
+    }
+
+    // Evaluate - evaluate the script
+    // nodeEval - the node evaluator to call
+    // baseName - baseName for all labels
+    // pass - which NDLPass we are on
+    // skipThrough - skip through this node; eval will be skipped for all nodes up to and including this one
+    NDLNode<ElemType>* Evaluate(NDLNodeEvaluator<ElemType>& nodeEval, const wstring& baseName, const NDLPass pass=ndlPassInitial, NDLNode<ElemType>* skipThrough=nullptr)
+    {
+        NDLNode<ElemType>* nodeLast = skipThrough;
+        bool skip = skipThrough != nullptr;
+        std::wstring prevBaseName = GetBaseName();
+        SetBaseName(baseName);
+
+        for (auto& node : m_script)
+        {
+            // if we are in skip mode and we found the skipThrough node,
+            // move out of skip mode and start processing at the next node
+            if (skip)
+            {
+                if (node == skipThrough)
+                    skip = false;
+                continue;
+            }
+
+            // if it's a macro call, call the macro
+            if (node->GetType() == ndlTypeMacroCall)
+            {
+                node->EvaluateMacro(nodeEval, baseName, pass);
+                nodeEval.ProcessOptionalParameters(node);
+            }
+            else
+            {
+                nodeEval.Evaluate(node, baseName, pass);
+            }
+            nodeLast = node;
+        }
+        SetBaseName(prevBaseName);
+        return nodeLast;
+    }
+};
+
+}}}
diff --git a/MachineLearning/cn/SGD.h b/MachineLearning/cn/SGD.h
index bffa9dcea..81de3cf69 100644
--- a/MachineLearning/cn/SGD.h
+++ b/MachineLearning/cn/SGD.h
@@ -1,1564 +1,1587 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-#pragma once
-
-#include "basetypes.h"
-#include "ComputationNetwork.h"
-#include "ComputationNetworkHelper.h"
-#include "SimpleEvaluator.h"
-#include "DataReader.h"
-#include
-#include
-#include
-#include "fileutil.h"
-#include "commandArgUtil.h"
-#include
-#include
-
-#ifdef MPI_SUPPORT
-#include "mpi.h"
-#endif
-extern int myRank;
-extern int numProcs;
-
-using namespace std;
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-    template<class ElemType>
-    void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb)
-    {
-        size_t rv = 0;
-        if (numProcs > 1) for (auto it = mb.begin(); it != mb.end(); ++it)
-        {
-            MSR::CNTK::Matrix<ElemType>& mat = *(it->second);
-            size_t nCols = mat.GetNumCols();
-            size_t col_start = (nCols * myRank) / numProcs;
-            size_t col_end = (nCols * (myRank + 1)) / numProcs;
-            if (col_end > nCols) col_end = nCols; // this shouldn't happen
-            if (col_end == col_start)
-            {
-                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
-                mat.SetValue(tmp);
-            }
-            else
-            {
-                MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
-                mat.SetValue(tmp);
-            }
-            if (0 == rv)
-            {
-                rv = mat.GetNumCols();
-            }
-            else
-            {
-                if (rv != mat.GetNumCols())
-                    throw std::logic_error("Uneven number of columns among inputs.");
-            }
-        }
-    }
-
-    enum class LearningRateSearchAlgorithm : int
-    {
-        None,
-        AdjustAfterEpoch,
-        SearchBeforeEpoch
-    };
-
-    enum class AdaptationRegType : int
-    {
-        None,
-        KL
-    };
-
-    enum class GradientsUpdateType : int
-    {
-        None,
-        AdaGrad,
-        RmsProp
-    };
-
-    // configuration parameters associated with the RMSProp learning algorithm
-    typedef struct stRMSPropInfo
-    {
-        double gamma;
-        double inc;
-        double dec;
-        double max;
-        double min;
-        stRMSPropInfo()
-        {
-            gamma = 0.99;
-            inc = 1.2;
-            dec = 0.75;
-            max = 10.0;
-            min = 0.1;
-        }
-    } RMSPropInfo;
-
-    typedef struct stGradientUpdateInfo
-    {
-        GradientsUpdateType mType;
-        float mGaussianNoiseInjectStd;
-        stGradientUpdateInfo()
-        {
-            mType = GradientsUpdateType::AdaGrad;
-            mGaussianNoiseInjectStd = 0.0075f;
-        }
-    } GradientUpdateInfo;
-
-    template<class ElemType>
-    class SGD : ComputationNetworkHelper<ElemType>
-    {
-    protected:
-        typedef ComputationNetworkHelper<ElemType> B;
-        using B::SetMaxTempMemSizeForCNN; using B::SetDropoutRate; using B::UpdateEvalTimeStamps;
-        typedef ComputationNode<ElemType>* ComputationNodePtr;
-        typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
-
-    public:
-        SGD(const ConfigParameters& configSGD)
-        {
-            ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
-            floatargvector learningRatesPerMB = learningRatesPerMBStr;
-
-            ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", "");
-            floatargvector learningRatesPerSample = learningRatesPerSampleStr;
-
-            std::string executionEngineValue = configSGD("executionEngine", "synchronous");
-
-#ifdef USE_PTASK
-            // use PTask if we have more than one GPU or the MultiGPU flag is set
-            bool usePtask = (g_bestGpu != NULL && g_bestGpu->UseMultiple()) || (bool)configSGD("MultiGPU", "false");
-#else
-            bool usePtask = false;
-#endif
-            // AutoAdjust parameters
-            ConfigParameters configAALR(configSGD("AutoAdjust", ""));
-            LearningRateSearchAlgorithm autoAdjustLRType = ParseLearningRateSearchType(configAALR("autoAdjustLR", "None"));
-            ElemType reduceLearnRateIfImproveLessThan = configAALR("reduceLearnRateIfImproveLessThan", "0");
-            bool continueReduce = (bool)configAALR("continueReduce", "false");
-            size_t learnRateAdjustInterval = (size_t)configAALR("learnRateAdjustInterval", "1");
-            ElemType learnRateDecreaseFactor =
configAALR("learnRateDecreaseFactor", "0.618"); - ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");// std::numeric_limits::infinity()); - ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382"); - ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500"); - intargvector numMiniBatch4LRSearch = minibatch4LRSearch; - size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5"); - size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1"); - bool loadBestModel = configAALR("loadBestModel", "true"); - - ConfigArray minibatchSize = configSGD("minibatchSize", "256"); - intargvector mbSize = minibatchSize; - size_t epochSize = configSGD("epochSize", "0"); - - size_t maxEpochs = configSGD("maxEpochs"); - ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); - floatargvector momentumPerMB = momentumPerMBStr; - - wstring modelPath = configSGD("modelPath"); - wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", ""); - wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", ""); - - size_t maxTempMemSizeInSamplesForCNN = configSGD("maxTempMemSizeInSamplesForCNN", "0"); - - int traceLevel = configSGD("traceLevel", "0"); - size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10"); - - bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false"); - - bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true"); - ElemType clippingThresholdPerSample = configSGD("clippingThresholdPerSample", "1#INF"); // std::numeric_limits::infinity()); - - ConfigArray dropoutRatesStr = configSGD("dropoutRate", "0.0"); - floatargvector dropoutRates = dropoutRatesStr; - - GradientUpdateInfo gUpdateInfo; - GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD("gradUpdateType", "None")); - ElemType gaussianNoiseInjecStd = configSGD("gaussianNoiseInjectStd", "0"); - gUpdateInfo.mType = gradUpdateType; - gUpdateInfo.mGaussianNoiseInjectStd = (float)gaussianNoiseInjecStd; - - // extract RMSProp parameters from config, if they exist. Default to reasonable values. - RMSPropInfo rpi; - rpi.dec = (double)configSGD("rms_wgt_dec", "0.75"); - rpi.inc = (double)configSGD("rms_wgt_inc", "1.2"); - rpi.min = (double)configSGD("rms_wgt_min", "0.1"); - rpi.max = (double)configSGD("rms_wgt_max", "10.0"); - rpi.gamma = (double)configSGD("rms_gamma", "0.99"); - - /// for backward support. 
future setup should use gradUpdateType=AdaGrad, instead of - /// useAdagrad=true - bool useAdagrad = configSGD("useAdagrad", "false"); - if (useAdagrad) - { - gradUpdateType = GradientsUpdateType::AdaGrad; - gUpdateInfo.mType = gradUpdateType; - } - - AdaptationRegType adaptationRegType = ParseAdaptationRegType(configSGD("adaptationRegType", "None")); - ElemType adaptationRegWeight = configSGD("adaptationRegWeight", "0"); - - /// gradient check setup - bool doGradientCheck = configSGD("gradientcheck", "false"); - ElemType gradientCheckSigDigit = configSGD("sigFigs", "6"); - - bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true"); - - Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation, - clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor, - reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates, - loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult, - maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight, - trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, - rpi, learnRateAdjustInterval); - } - - void setMomentum(float momentum) - { - m_momentumPerMB = (ElemType)momentum; - } - - //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample - void Init(const floatargvector& learningRatesPerMB, const floatargvector& learningRatesPerSample, const intargvector& mbSize, - const size_t epochSize, const size_t maxEpochs, - const wstring& modelPath, const floatargvector& momentumPerMB, const bool gradientClippingWithTruncation = true, - const ElemType clippingThresholdPerSample=std::numeric_limits::infinity(), - const LearningRateSearchAlgorithm autoLearnRateSearchType = LearningRateSearchAlgorithm::None, - const ElemType increaseLearnRateIfImproveMoreThan = std::numeric_limits::infinity(), const ElemType learnRateIncreaseFactor = 1.382f, - const ElemType reduceLearnRateIfImproveLessThan=0, const bool continueReduce=false, const ElemType learnRateDecreaseFactor = 0.618f, floatargvector dropoutRates = floatargvector(L"0.0f"), - const bool loadBestModel=true, const intargvector& numMiniBatch4LRSearch=intargvector(L"500"), const size_t numPrevLearnRates = 5, - const size_t numBestSearchEpoch = 1, const int traceLevel = 0, - const size_t numMBsToShowResult = 10, const size_t maxTempMemSizeInSamplesForCNN = 0, - const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None, - const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"", - const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true, - RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1) - { - numPrevLearnRates; - m_mbSize=mbSize; - m_epochSize=epochSize; - if (m_epochSize == 0) - { - m_epochSize = requestDataSize; - } - m_maxEpochs=maxEpochs; - - m_gradientClippingWithTruncation=gradientClippingWithTruncation; - m_modelPath=modelPath; - m_autoLearnRateSearchType=autoLearnRateSearchType; - 
m_traceLevel=traceLevel; - m_loadBestModel=loadBestModel; - m_increaseLearnRateIfImproveMoreThan=increaseLearnRateIfImproveMoreThan; - m_learnRateIncreaseFactor=learnRateIncreaseFactor; - m_reduceLearnRateIfImproveLessThan=reduceLearnRateIfImproveLessThan; - m_continueReduce=continueReduce; - m_learnRateAdjustInterval = max(1, learnRateAdjustInterval); //minimum interval is 1 epoch - m_learnRateDecreaseFactor=learnRateDecreaseFactor; - m_clippingThresholdPerSample=abs(clippingThresholdPerSample); - m_numMiniBatch4LRSearch=numMiniBatch4LRSearch; - m_dropoutRates=dropoutRates; - m_numMBsToShowResult=int(numMBsToShowResult); - m_numBestSearchEpoch=numBestSearchEpoch; - m_maxTempMemSizeInSamplesForCNN=maxTempMemSizeInSamplesForCNN; - m_gradType = gradUpdateType; - m_rpi = rpi; - m_usePtask = usePtask; - m_keepCheckPointFiles = keepCheckPointFiles; - - m_adaptationRegType = adaptationRegType; - m_adaptationRegWeight = adaptationRegWeight; - - m_trainCriterionNodeName = trainCriterionNodeName; - m_evalCriterionNodeName = evalCriterionNodeName; - - for (size_t i=0; i 0 && learningRatesPerMB.size() > 0) - { - throw std::invalid_argument ("You specified both learningRatesPerSample and learningRatesPerMB. Please comment out one of them."); - } - else if (learningRatesPerSample.size() > 0) - { - m_learningRatesPerSample=learningRatesPerSample; - } - else if (learningRatesPerMB.size() > 0) - { - int LRSize = (int)max(learningRatesPerMB.size(), m_mbSize.size()); - m_learningRatesPerSample.resize(LRSize); - for (int i=0; i0) - { - m_momentumInputPerMB=momentumPerMB; - if (m_momentumInputPerMB[0]>=1 || m_momentumInputPerMB[0]<0) - throw std::invalid_argument ("momentumPerMB must be in [0, 1)."); - } - - if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor<1) - { - throw std::invalid_argument ("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1."); - } - - for (size_t i=0; i= 1 || m_dropoutRates[i] < 0) - { - throw std::invalid_argument ("dropoutRate must be >= 0 and < 1."); - } - } - - if (m_adaptationRegWeight > 1 || m_adaptationRegWeight <0) - throw invalid_argument("adaptationRegWeight must be in [0 1]"); - - m_minLearnRate = 1e-9f; - - m_needRegularization = false; - - m_doGradientCheck = doGradientCheck; - m_gradientCheckSigDigit = gradientCheckSigDigit; - m_validateAfterModelReloading = validateAfterModelReloading; - - msra::files::make_intermediate_dirs (m_modelPath); - } - - void Adapt(wstring origModelFileName, wstring refNodeName, IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, const DEVICEID_TYPE deviceID, const bool makeMode = true) - { - if (origModelFileName == L"" || trainSetDataReader == nullptr) - throw std::invalid_argument ("origModel and trainSetDataReader should not be null."); - - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - ComputationNetwork net(deviceID); - if (startEpoch >= 0) - { - wstring modelFileName = GetModelNameForEpoch(int(startEpoch)-1); - fprintf(stderr, "Starting from checkpoint. 
Load Network From File %ls.\n", modelFileName.c_str()); - net.LoadFromFile(modelFileName); - } - else - { - fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str()); - net.LoadFromFile(origModelFileName); - } - - startEpoch = max(startEpoch, 0); - - ComputationNetwork refNet(deviceID); - m_needRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0; - if (m_needRegularization) - { - fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str()); - refNet.LoadFromFile(origModelFileName); - } - - ComputationNodePtr refNode = nullptr; - if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL) - { - fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str()); - if (refNodeName == L"") - throw invalid_argument("refNodeName does not exist and is needed when adaptationRegType is KL."); - - refNode = refNet.GetNodeFromName(refNodeName); - } - - TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader); - } - - void Train(IComputationNetBuilder* netBuilder, IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, const bool makeMode = true) - { - if (netBuilder == nullptr || trainSetDataReader == nullptr) - throw std::invalid_argument ("netBuilder and trainSetDataReader should not be null.\n"); - - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - wstring modelFileName = GetModelNameForEpoch(int(startEpoch)-1); - if (startEpoch >= 0) - fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str()); - ComputationNetwork& net = - startEpoch<0? netBuilder->BuildNetworkFromDescription() : netBuilder->LoadNetworkFromFile(modelFileName); - // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model - // strategy should be to run the initializer above on myRank==0, and then broadcast parameters. 
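+// Illustrative sketch (editorial): one way to realize the TODO above, by
+// broadcasting rank 0's freshly initialized parameters to the other ranks.
+// It assumes the learnable-node list is available at this point and reuses
+// the CopyToArray()/SetValue() pattern of the Allreduce-based model averaging
+// further down in this file.
+#if 0
+for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); ++nodeIter)
+{
+    Matrix<ElemType>& mat = (*nodeIter)->FunctionValues();
+    ElemType* buf = mat.CopyToArray(); // host-side copy of the weights
+    size_t n = mat.GetNumElements();
+    MPI_Bcast(buf, (int)n, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    if (myRank != 0) // non-master ranks overwrite their unsynchronized values
+        mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), buf);
+    delete[] buf;
+}
+#endif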
- - startEpoch = max(startEpoch, 0); - m_needRegularization = false; - - TrainOrAdaptModel(startEpoch, net, net, nullptr, trainSetDataReader, validationSetDataReader); - } - - protected: - std::vector GetTrainCriterionNodes(ComputationNetwork& net) - { - fprintf(stderr, "GetTrainCriterionNodes %ls ...\n", m_trainCriterionNodeName.c_str()); - if (!m_trainCriterionNodeName.empty()) - { - std::vector nodes; - ComputationNodePtr node = net.GetNodeFromName(m_trainCriterionNodeName); - net.ValidateNetwork(node); - if (node->FunctionValues().GetNumElements() != 1) - throw invalid_argument("the trainCriterionNodeName specified in the config file is not a valid training criterion node."); - - nodes.push_back(node); - return nodes; - } - else - return net.FinalCriterionNodes(); - } - std::vector GetEvalCriterionNodes(ComputationNetwork& net) - { - fprintf(stderr, "GetEvalCriterionNodes %ls ...\n", m_evalCriterionNodeName.c_str()); - if (!m_evalCriterionNodeName.empty()) - { - std::vector nodes; - ComputationNodePtr node = net.GetNodeFromName(m_evalCriterionNodeName); - net.ValidateNetwork(node); - if (node->FunctionValues().GetNumElements() != 1) - throw invalid_argument("the evalCriterionNodeName specified in the config file is not a valid evaluation criterion node."); - - nodes.push_back(node); - return nodes; - } - else - return net.EvaluationNodes(); - } - - void TrainOrAdaptModel(int startEpoch, ComputationNetwork& net, ComputationNetwork& refNet, ComputationNodePtr refNode, - IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) - { - std::vector & FeatureNodes = net.FeatureNodes(); - std::vector & labelNodes = net.LabelNodes(); - std::vector criterionNodes = GetTrainCriterionNodes(net); - std::vector evaluationNodes = GetEvalCriterionNodes(net); - - std::map*> inputMatrices; - for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); - } - for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); - } - - // special handling of classed based softmax node. Need a better solution to it. - if (criterionNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName() || - evaluationNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName()) - { - size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); - int deviceId = FeatureNodes[0]->FunctionValues().GetDeviceId(); - inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); - inputMatrices[L"classinfo"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); - } - - - //used for KLD regularized adaptation. For all other adaptation techniques use MEL to edit the model and using normal training algorithm - std::vector refFeatureNodes; - if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) - { - refFeatureNodes.resize(FeatureNodes.size()); - for (size_t i=0; iNodeName()); //we need to keep this info to handle deletion - refNet.ChangeNode(FeatureNodes[i]->NodeName(), FeatureNodes[i]); - } - - refNet.RebuildNetwork(refNode); - } - - //initializing weights and gradient holder - std::list& learnableNodes = net.LearnableNodes(criterionNodes[0]); //only one criterion so far TODO: support multiple ones? 
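+// Editorial note: one smoothed-gradient matrix is kept per learnable node,
+// sized like the node's value matrix. NormalGrad/Adagrad/RmsProp (see
+// UpdateWeightsS below) use it as their persistent state across minibatches,
+// and SaveCheckPointInfo/LoadCheckPointInfo serialize it alongside the model.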
-        std::list<Matrix<ElemType>> smoothedGradients;
-
-        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-        {
-            ComputationNodePtr node = (*nodeIter);
-            smoothedGradients.push_back(Matrix<ElemType>(node->FunctionValues().GetNumRows(), node->FunctionValues().GetNumCols(), net.GetDeviceID()));
-        }
-
-        ElemType epochCriterion, avgCriterion, prevCriterion;
-        epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<ElemType>::infinity();
-        size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
-
-        std::vector<ElemType> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<ElemType>::infinity());
-
-        std::vector<wstring> evalNodeNames;
-        for (size_t i = 0; i < evaluationNodes.size(); i++)
-            evalNodeNames.push_back(evaluationNodes[i]->NodeName());
-
-        size_t totalSamplesSeen = 0;
-        ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch];
-
-        int m_numPrevLearnRates = 5; // used to control the upper learning rate in the LR search, to reduce computation
-        vector<ElemType> prevLearnRates;
-        prevLearnRates.resize(m_numPrevLearnRates);
-        for (int i = 0; i < m_numPrevLearnRates; i++)
-            prevLearnRates[i] = std::numeric_limits<ElemType>::infinity();
-
-        // precompute mean and invStdDev nodes and save the initial model
-        if (PreCompute(net, trainSetDataReader, FeatureNodes, labelNodes, inputMatrices) || startEpoch == 0)
-            if (0 == myRank) // only needs to be done by one process
-                net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
-
-        bool learnRateInitialized = false;
-        if (startEpoch > 0)
-        {
-            learnRateInitialized = LoadCheckPointInfo(startEpoch-1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
-            setMomentum(m_momentumInputPerMB[m_momentumInputPerMB.size()-1]);
-        }
-
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch)
-            throw std::invalid_argument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in the config for the starting epoch.");
-
-        unsigned long dropOutSeed = 1;
-        ElemType prevDropoutRate = 0;
-
-        bool learnRateReduced = false;
-
-        SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN);
-        if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
-            SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN);
-
-        // build the PTask graph if they want to use PTask
-        // NOTE: the graph is currently only for training, so other operations will still use the usual method
-        // (i.e. rate adjustment and other custom operations still use the non-PTask method)
-        if (m_usePtask)
-        {
-            // set the minibatch size to the largest thing we will ever see
-            int maxMbSize = 0;
-            for (int val : m_mbSize)
-            {
-                maxMbSize = max(val, maxMbSize);
-            }
-            net.SetActualMiniBatchSize(maxMbSize);
-            net.BuildPTaskGraph();
-        }
-
-        for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
-        {
-            auto t_start_epoch = clock();
-
-            // set other information in inputMatrices that can contain information
-            // used for class-based LM, e.g. clustering information
-            SetOtherInfo(net, trainSetDataReader, validationSetDataReader, inputMatrices);
-
-            // set the dropout rate
-            SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
-
-            // learning rate adjustment
-            if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
-            {
-                learnRatePerSample = m_learningRatesPerSample[i];
-                setMomentum(m_momentumInputPerMB[i]);
-            }
-            else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
-            {
ElemType largestPrevLearnRatePerSample = prevLearnRates[0]; - for (int j = 1; j < m_numPrevLearnRates; j++) - { - largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); - } - - //return a reasonable learning rate based on the initial mbsize - learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, trainSetDataReader, FeatureNodes, - labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample); - - prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; //save per sample learn rate to support changeable mbsize - } - - learnRateInitialized = true; - - if (learnRatePerSample < m_minLearnRate) - { - fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); - if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) - net.SaveToFile(m_modelPath); - break; - } - -#ifdef MPI_SUPPORT - INT32 mySamples = (INT32) -#endif - TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes, - criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - - auto t_end_epoch = clock(); - ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC); - - fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion); - if (epochEvalErrors.size() == 1) - { - fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); - } - else - { - fprintf(stderr, "EvalErr Per Sample "); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime); - fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]); - } - -#ifdef MPI_SUPPORT - // model reduction and averaging - if (numProcs > 0) - { - ElemType factor; // weight for the parameter of my model - { - // compute total minibatch size - INT32 allSamples = 0; - MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (allSamples == 0) allSamples = 1; - - factor = (ElemType)mySamples / (ElemType)allSamples; - } - - for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) - { - ComputationNodePtr node = (*nodeIter); - Microsoft::MSR::CNTK::Matrix &mat = node->FunctionValues(); - - // weight model by relative size of minibatch samples (and number of processors, for averaging) - ElemType *px = mat.CopyToArray(); - size_t nx = mat.GetNumElements(); - transform(px, px + nx, px, [factor](ElemType&val)->ElemType{return val * factor; }); - - // TODO: Replace default Allreduce with the reduction-shuffle-dance - vector py = vector(nx, ElemType(0)); - MPI_Allreduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? 
MPI_FLOAT : MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0])); - delete px; - } - } -#endif - - if ( 0 == myRank ) // only evaluate once, on the master process. TODO: This could be faster by farming out the validation parts - if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr) - { - SimpleEvaluator evalforvalidation(net); - vector cvSetTrainAndEvalNodes; - cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName()); - cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); - - vector vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n", - i + 1, vScore[0], vScore[1]); - - epochCriterion = vScore[0]; //the first one is the training criterion. - } -#ifdef MPI_SUPPORT - // ensure all processes have the same epochCriterion - MPI_Bcast(&epochCriterion, 1, sizeof(epochCriterion) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD); -#endif - - bool loadedPrevModel = false; - size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1; - if (avgCriterion == std::numeric_limits::infinity()) - avgCriterion = epochCriterion; - else - avgCriterion = ((epochsSinceLastLearnRateAdjust -1 - epochsNotCountedInAvgCriterion)* avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) - { - if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits::infinity()) - { - if (m_loadBestModel) - { - net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i-1), m_validateAfterModelReloading); - net.ResetEvalTimeStamp(); - LoadCheckPointInfo(i-1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); - fprintf(stderr, "Loaded the previous model which has better training criterion.\n"); - loadedPrevModel = true; - } - } - - if(m_continueReduce) - { - if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - if(learnRateReduced == false) - { - learnRateReduced = true; - } - else - { - if ( myRank == 0 ) - net.SaveToFile(GetModelNameForEpoch(i, true)); - fprintf(stderr, "Finished training and saved final model\n\n"); - break; - } - } - if(learnRateReduced) - { - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - } - else - { - if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan*prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - learnRatePerSample *= m_learnRateIncreaseFactor; - fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample); - } - } - } - - if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) //not loading previous values then set them - { - prevCriterion = avgCriterion; - epochsNotCountedInAvgCriterion = 0; - } - - //persist model and 
check-point info
-            if (0 == myRank)
-            {
-                net.SaveToFile(GetModelNameForEpoch(i));
-                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
-                if (!m_keepCheckPointFiles)
-                    _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); // delete the previous checkpoint file to save space
-            }
-
-            if (learnRatePerSample < 1e-12)
-                fprintf(stderr, "learnRate per sample is reduced to %.8g, which is below 1e-12. Stop training.\n", learnRatePerSample);
-        }
-
-        if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) // since we linked feature nodes, we need to remove them from the deletion
-        {
-            for (size_t i = 0; i < refFeatureNodes.size(); i++)
-            {
-                refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]); // note we need to handle deletion carefully
-            }
-        }
-
-        if (inputMatrices[L"classinfo"])
-        {
-            delete inputMatrices[L"classinfo"];
-            inputMatrices.erase(L"classinfo");
-        }
-        if (inputMatrices[L"idx2cls"])
-        {
-            delete inputMatrices[L"idx2cls"];
-            inputMatrices.erase(L"idx2cls");
-        }
-    }
-
-    protected:
-
-        // returns true if precomputation was executed
-        bool PreCompute(ComputationNetwork<ElemType>& net,
-                        IDataReader<ElemType>* trainSetDataReader,
-                        std::vector<ComputationNodePtr>& FeatureNodes,
-                        std::vector<ComputationNodePtr>& labelNodes,
-                        std::map<std::wstring, Matrix<ElemType>*>& inputMatrices)
-        {
-            std::list<ComputationNodePtr> nodes = net.GetNodesRequirePreComputation();
-
-            if (nodes.size() == 0)
-            {
-                fprintf(stderr, "No PreCompute nodes found, skipping the PreCompute step\n");
-                return false;
-            }
-
-            fprintf(stderr, "Found %lu PreCompute nodes\n", nodes.size());
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-            {
-                PreComputedNode<ElemType>* node = static_cast<PreComputedNode<ElemType>*>(*nodeIter);
-                fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
-            }
-
-            // compute
-            //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, requestDataSize);
-            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // only based on one epoch
-
-            while (trainSetDataReader->GetMinibatch(inputMatrices))
-            {
-                UpdateEvalTimeStamps(FeatureNodes);
-                UpdateEvalTimeStamps(labelNodes);
-
-                size_t actualMBSize = net.GetActualMBSize();
-                net.SetActualMiniBatchSize(actualMBSize);
-                net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
-                trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd);
-
-                for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-                {
-                    net.Evaluate(*nodeIter);
-                }
-            }
-
-            // mark done
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-            {
-                PreComputedNode<ElemType>* node = static_cast<PreComputedNode<ElemType>*>(*nodeIter);
-                node->MarkComputed(true);
-            }
-
-            return true;
-        }
-
-        // return a reasonable initial learning rate based on the initial minibatch size
-        ElemType SearchLearnRateBeforeEpoch(ComputationNetwork<ElemType>& net, ComputationNetwork<ElemType>& refNet, const ComputationNodePtr refNode,
-                                            const int epochNumber, const ElemType curLearnRate,
-                                            IDataReader<ElemType>* trainSetDataReader,
-                                            const std::vector<ComputationNodePtr>& FeatureNodes,
-                                            const std::vector<ComputationNodePtr>& labelNodes,
-                                            const std::vector<ComputationNodePtr>& criterionNodes,
-                                            const std::vector<ComputationNodePtr>& evaluationNodes,
-                                            std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
-                                            const std::list<ComputationNodePtr>& learnableNodes,
-                                            std::list<Matrix<ElemType>>& smoothedGradients, const bool /*learnRateInitialized*/, const ElemType largestPrevLearnRatePerSample)
-        {
-            ElemType epochCriterion = std::numeric_limits<ElemType>::infinity(), prevCriterion = std::numeric_limits<ElemType>::infinity();
-            vector<ElemType> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<ElemType>::infinity());
-            //ElemType epochEvalError = std::numeric_limits<ElemType>::infinity();
-            size_t totalSamplesSeen = 0;
-            ElemType bestLearnRatePerSample = curLearnRate;
-
-            size_t epochSize
= m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; - if (m_epochSize != requestDataSize) - { - epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision - } - - ElemType baseCriterion; - - ElemType minLearnRate = m_minLearnRate * 0.3f; - ElemType learnRatePerSample = 1.0f / 8.0f / 0.618f /sqrt((ElemType)m_mbSize[epochNumber]); - - if (largestPrevLearnRatePerSample != std::numeric_limits::infinity()) - learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f; //largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety - - int baseModelEpoch = epochNumber-1; - net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading); - net.ResetEvalTimeStamp(); - - ElemType learnRate =learnRatePerSample; - LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, smoothedGradients, prevCriterion); - - //if model is not changed this is what we will get - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, 0, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, baseCriterion, epochEvalErrors, totalSamplesSeen); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) - { - if (prevCriterion == std::numeric_limits::infinity()) - prevCriterion = baseCriterion; - ElemType ratio = 0.3f; - if (m_epochSize != requestDataSize) - { - ratio = pow(((ElemType)epochSize) / m_epochSize, 1.0f/2); - } - baseCriterion = max(ratio * prevCriterion + (1-ratio) * baseCriterion, baseCriterion); - } - - do - { - learnRatePerSample *= 0.618f; - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); - - } while (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate); - - bestLearnRatePerSample = learnRatePerSample; - - if (epochNumber < m_numBestSearchEpoch) //grid search for the first m_numBestSearchEpoch epochs - { - ElemType leftLearnRatePerSample = 0.01f / m_mbSize[epochNumber], rightLearnRatePerSample = learnRatePerSample; - ElemType leftCriterion, rightCriterion = epochCriterion; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); - - while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) - { - if (rightCriterion > leftCriterion) - { - rightLearnRatePerSample *= 0.618f; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, rightLearnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, rightCriterion, epochEvalErrors, totalSamplesSeen); - } - else - { - leftLearnRatePerSample /= 0.618f; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, - FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); - } - } - - bestLearnRatePerSample = (leftCriterion < rightCriterion)? 
leftLearnRatePerSample : rightLearnRatePerSample; - } - - fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n", epochNumber+1, bestLearnRatePerSample, baseCriterion); - - return bestLearnRatePerSample; - } - - void TrainOneMiniEpochAndReloadModel(ComputationNetwork& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, - const int epochNumber,const size_t epochSize, IDataReader* trainSetDataReader, const ElemType learnRatePerSample, - const std::vector& FeatureNodes, - const std::vector& labelNodes, - const std::vector& criterionNodes, - const std::vector& evaluationNodes, - std::map*>& inputMatrices, - const std::list& learnableNodes, - std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) - { - TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample,FeatureNodes,labelNodes, - criterionNodes,evaluationNodes,inputMatrices, learnableNodes,smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ", epochCriterion); - if (epochEvalErrors.size()==1) - fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n", epochEvalErrors[0], learnRatePerSample); - else - { - fprintf(stderr, "EvalErr Per Sample "); - for (size_t i=0; i& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, - const int epochNumber, const size_t epochSize, - IDataReader* trainSetDataReader, const ElemType learnRatePerSample, - const std::vector& FeatureNodes, - const std::vector& labelNodes, - const std::vector& criterionNodes, - const std::vector& evaluationNodes, - std::map*>& inputMatrices, - const std::list& learnableNodes, - std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) - { - ElemType readTimeInMBs = 0, ComputeTimeInMBs = 0, epochCriterionLastMBs = 0; - int numSamplesLastMBs = 0; - std::vector epochEvalErrorsLastMBs(epochEvalErrors.size(),0); - PTaskGraphBuilder* ptaskGraphBuilder = NULL; - - clock_t startReadMBTime = 0, startComputeMBTime=0; - clock_t endReadMBTime=0, endComputeMBTime=0; - - //initialize statistics - size_t totalEpochSamples = 0; - - int numMBsRun = 0; - bool beginEpoch = true; - - size_t numEvalNodes = epochEvalErrors.size(); - - // NOTE: the following two local matrices are not used in PTask path - Matrix localEpochCriterion(1,1,net.GetDeviceID()); //assume only one training criterion node for each epoch - Matrix localEpochEvalErrors(1,numEvalNodes,net.GetDeviceID()); - - localEpochCriterion.SetValue(0); - localEpochEvalErrors.SetValue(0); - - if (m_usePtask) - { - epochCriterion = ElemType(0.0); - epochEvalErrors.assign(numEvalNodes, ElemType(0.0)); - } - - trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); - - // build the PTask graph if they want to use ptask - // NOTE: the graph is currently only for training, so other operations will still use the usual method, - // (i.e rate adjustment, regularization and other custom operations still use the non PTask method) - if (m_usePtask) - { - ptaskGraphBuilder = net.GetPTaskGraphBuilder(); - ptaskGraphBuilder->UpdateParameters(this, learnRatePerSample, m_mbSize[epochNumber]); - ptaskGraphBuilder->StartPTaskGraph(); - - // currently CNTK likes to keep things on the GPU, and PTask expects things to be on the CPU, so tell CNTK to keep data on the CPU - for 
(std::pair*> inpair : inputMatrices) - { - Matrix* mat = inpair.second; - mat->SetPreferredDeviceId(CPUDEVICE); - mat->TransferFromDeviceToDevice(mat->GetDeviceId(), CPUDEVICE, true); - } - } - - startReadMBTime=clock(); - while (trainSetDataReader->GetMinibatch(inputMatrices)) - { -#ifdef MPI_SUPPORT - DecimateMinibatch(inputMatrices); -#endif - endReadMBTime=clock(); - startComputeMBTime=clock(); - - UpdateEvalTimeStamps(FeatureNodes); - UpdateEvalTimeStamps(labelNodes); - - size_t actualMBSize = net.GetActualMBSize(); - if (0 == actualMBSize) - continue; - - net.SetActualMiniBatchSize(actualMBSize); - net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); - trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd); - -#ifndef EVALDLL - if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false) - { - throw std::logic_error("cannot pass gradient checker"); - } -#endif - if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) //TODO: currently only support one node regularization - { - refNet.SetActualMiniBatchSize(actualMBSize); - refNet.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); - refNet.Evaluate(refNode); - Matrix::ScaleAndAdd(m_adaptationRegWeight, refNode->FunctionValues(), 1-m_adaptationRegWeight, labelNodes[0]->FunctionValues()); - } - - if (m_usePtask) - { - // Pushing data in the graph starts things going - bool endOfEpoch = trainSetDataReader->DataEnd(endDataEpoch); - CONTROLSIGNAL signal = beginEpoch?DBCTLC_BOF:DBCTLC_NONE; - if (endOfEpoch) - signal |= DBCTLC_EOF; - - ptaskGraphBuilder->PushData(inputMatrices, signal); - ptaskGraphBuilder->PushActualMBSize(learnableNodes, net.GetActualMBSize(), signal); - beginEpoch = false; // clear this out after first epoch - - // pull the values from the graph for the totals - epochCriterion += ptaskGraphBuilder->GetValue(criterionNodes[0]); - for (size_t i=0; iGetValue(evaluationNodes[i]); - } - - // NOTE: update model parameters is part of the graph, so nothing to do here - } - else - { - if (learnRatePerSample > m_minLearnRate * 0.01) //only compute gradient when learning rate is large enough - net.ComputeGradient(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? - else - net.Evaluate(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? 
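+// Editorial note: under MPI_SUPPORT, DecimateMinibatch (defined near the top
+// of this file) has already split each input matrix by columns, so rank r
+// keeps columns [nCols*r/numProcs, nCols*(r+1)/numProcs). For example, 1000
+// columns across 4 ranks leaves 250 columns per rank before the gradient
+// computation above.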
- - Matrix::AddElementToElement(criterionNodes[0]->FunctionValues(), 0, 0, localEpochCriterion, 0, 0); - - std::vectormbEvalErrors(numEvalNodes,0); - for (size_t i=0; i::AddElementToElement(evaluationNodes[i]->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i); - } - - //update model parameters - if (learnRatePerSample > m_minLearnRate * 0.01) - { - auto smoothedGradientIter=smoothedGradients.begin(); - for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++) - { - ComputationNodePtr node = (*nodeIter); - Matrix& smoothedGradient = (*smoothedGradientIter); - - UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber]); - } - } - } - - - endComputeMBTime=clock(); - numMBsRun ++; - if (m_traceLevel > 0) - { - ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(CLOCKS_PER_SEC); - ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/CLOCKS_PER_SEC; - - readTimeInMBs += MBReadTime; - ComputeTimeInMBs += MBComputeTime; - numSamplesLastMBs += int(actualMBSize); - - if (numMBsRun % m_numMBsToShowResult == 0) - { - if (!m_usePtask) - { // get the epoch Values updated, in PTask don't use the loclEpoch* temporary matrices - epochCriterion = localEpochCriterion.Get00Element(); - for (size_t i=0; i< numEvalNodes; i++) - epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); - } - - fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ",epochNumber+1, numMBsRun-m_numMBsToShowResult+1, numMBsRun, numSamplesLastMBs, - (epochCriterion-epochCriterionLastMBs)/numSamplesLastMBs); - for (size_t i=0; i= epochSize) - break; - - /// call DataEnd function - /// DataEnd does reader specific process if sentence ending is reached - trainSetDataReader->DataEnd(endDataSentence); - - } - - if (m_usePtask) - { - // when the epoch is complete, we need to transfer all the values back to the LearnableNodes, which will be saved off as the model - std::list learnableNodes = net.LearnableNodes(criterionNodes[0]); - for (ComputationNodePtr node : learnableNodes) - { - ptaskGraphBuilder->GetValue(node, node->FunctionValues()); - } - epochCriterion /= float(totalEpochSamples); - for (size_t i=0; i< numEvalNodes; i++) - { - epochEvalErrors[i] /= float(totalEpochSamples); - } - } - else - { - localEpochCriterion /= float(totalEpochSamples); - localEpochEvalErrors /= float(totalEpochSamples); - - epochCriterion = localEpochCriterion.Get00Element(); - for (size_t i=0; i< numEvalNodes; i++) - { - epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); - } - } - return totalEpochSamples; - } -public: - // UpdateWeightsS - static version of UpdateWeights() - static void UpdateWeightsS(const SGD* sgd, Matrix& functionValues, Matrix& gradientValues, Matrix& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize) - { -#if DUMPOUTPUT - fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",learnRatePerSample, actualMBSize, expectedMBSize); - fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f, sgd->MomentumPerMB()=%0.8f\n",sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd(), sgd->MomentumPerMB()); - gradientValues.Print("Gradient Input"); - smoothedGradient.Print("Smoothed Gradient Input"); -#endif - - // make actualMBSize is a valid value - assert(actualMBSize > 0); - - //clipping gradients to prevent outliers - sgd->ClipGradient(gradientValues, actualMBSize); 
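+// Editorial note on the momentum handling below: for a partial minibatch the
+// momentum is rescaled as
+//     momentum' = exp(log(momentum) / expectedMBSize * actualMBSize)
+//               = momentum^(actualMBSize / expectedMBSize)
+// which keeps the effective per-sample momentum constant when a minibatch is
+// smaller than the configured size.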
- - GradientsUpdateType adpType = sgd->GradUpdateType(); - ElemType noiseStd = sgd->GradientUpdateNoiseStd(); - Matrix sgdUpdateNoise((DEVICEID_TYPE)functionValues.GetDeviceId()); - if (noiseStd > 0) - { - sgdUpdateNoise.SetValue(gradientValues); /// get the gradient structure since gradient is sparse - sgdUpdateNoise.SetGaussianRandomValue(0, noiseStd); // reset its value to random - } - - if (adpType == GradientsUpdateType::None) - { - ElemType momentum = sgd->MomentumPerMB(); - if (actualMBSize < expectedMBSize && momentum > 0.0000001f) //we use simple linear (instead of log linear) scaling here - { - momentum = (ElemType) exp (log(momentum)/expectedMBSize * actualMBSize); - } - smoothedGradient.NormalGrad(gradientValues, functionValues, learnRatePerSample, momentum); - } - if (adpType == GradientsUpdateType::AdaGrad) - { - smoothedGradient.Adagrad(gradientValues); - Matrix::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues); - } - if (adpType == GradientsUpdateType::RmsProp) - { - // include L2 regularizer - Matrix::ScaleAndAdd((ElemType)0.001, functionValues, gradientValues); - smoothedGradient.RmsProp(gradientValues, (ElemType)sgd->m_rpi.gamma, (ElemType)sgd->m_rpi.inc, (ElemType)sgd->m_rpi.max, (ElemType)sgd->m_rpi.dec, (ElemType)sgd->m_rpi.min); - Matrix::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues); - } - - if (noiseStd > 0) - { - Matrix::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues); - } -#if DUMPOUTPUT - functionValues.Print("Parameter Update"); -#endif - } -protected: - // UpdateWeights - update the weights in - void UpdateWeights(const ComputationNodePtr node, Matrix& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize) const - { -#if DUMPOUTPUT - fprintf(stderr, "Update_%ls\n",node->NodeName().c_str()); -#endif - UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize); - node->UpdateEvalTimeStamp(); - } - - void ClipGradient(Matrix& gradient, const size_t actualMBSize) const - { - if (m_clippingThresholdPerSample != std::numeric_limits::infinity()) - { - ElemType maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize; - if (m_gradientClippingWithTruncation) - { - gradient.InplaceTruncate(maxGradientPerMB); - } - else //norm2 normalized - { - ElemType gradientNorm = gradient.FrobeniusNorm(); - if (gradientNorm > maxGradientPerMB) - { - ElemType normFactor = maxGradientPerMB / gradientNorm; - gradient *= normFactor; - } - } - } - } - - void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample, - const std::list>& smoothedGradients, const ElemType prevCriterion) - { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); - - File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite); - fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); - - fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); - fstream << totalSamplesSeen << learnRatePerSample << prevCriterion; - fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); - - fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); - - for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) - { - const Matrix& smoothedGradient = (*smoothedGradientIter); - fstream << smoothedGradient; - } - 
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient"); - - fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP"); - } - - bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, ElemType& learnRatePerSample, - std::list>& smoothedGradients, ElemType& prevCriterion) - { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); - if (!fexists(checkPointFileName.c_str()) ) - { - fprintf(stderr, "Warning: checkpiont file is missing. learning parameters will be initialized from 0\n"); - return false; - } - - File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); - - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); - fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); - - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); - - for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) - { - Matrix& smoothedGradient = (*smoothedGradientIter); - fstream >> smoothedGradient; - } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient"); - - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP"); - - return true; - } - - wstring GetCheckPointFileNameForEpoch (const int epoch) - { - return GetModelNameForEpoch (epoch) + L".ckp"; - } - - wstring GetModelNameForEpoch (const int epoch, bool bLastModel = false) - { - int epoch1Base = epoch + 1; - if (epoch1Base == m_maxEpochs || bLastModel) - return m_modelPath; - else - return msra::strfun::wstrprintf (L"%s.%d", m_modelPath.c_str(), (int) epoch1Base); - } - - //return -1 if nothing exists - int DetermineStartEpoch (const bool makeMode) - { - if (!makeMode) - return -1; //always start from scratch - - int firstEpoch = -1; - - wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs)-1); - for (int e = int(m_maxEpochs)-1; e >= -1; e--) - { - const wstring prevEpochFile = GetModelNameForEpoch (e-1); - - if (msra::files::fuptodate (curEpochFile, prevEpochFile, false)) - { - firstEpoch = size_t(e)+1; - break; - } - else - curEpochFile = prevEpochFile; - } - - return firstEpoch; - } - - AdaptationRegType ParseAdaptationRegType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"" || s == L"none") - return AdaptationRegType::None; - else if (s == L"kl" || s == L"klreg" ) - return AdaptationRegType::KL; - else - throw std::invalid_argument( - "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are " - "(None | KL)"); - } - - GradientsUpdateType ParseGradUpdateType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"" || s == L"none") - return GradientsUpdateType::None; - else if (s == L"adagrad") - return GradientsUpdateType::AdaGrad; - else if (s == L"rmsprop") - return GradientsUpdateType::RmsProp; - else - throw std::invalid_argument( - "ParseGradUpdateType: Invalid Gradient Updating Type. 
Valid values are " - "(None | AdaGrad | RmsProp )"); - } - - LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"false" || s == L"none") - return LearningRateSearchAlgorithm::None; - else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before") - return LearningRateSearchAlgorithm::SearchBeforeEpoch; - else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after") - return LearningRateSearchAlgorithm::AdjustAfterEpoch; - else - throw std::invalid_argument( - "autoAdjustLR: Invalid learning rate search type. Valid values are " - "(None | SearchBeforeEpoch | AdjustAfterEpoch)"); - } - - GradientsUpdateType GradUpdateType() const {return m_gradType.mType;} - ElemType GradientUpdateNoiseStd() const {return m_gradType.mGaussianNoiseInjectStd;} - ElemType MomentumPerMB() const {return m_momentumPerMB;} - - public: - #define EPSILON 1e-5 - - bool GradientCheck( - ComputationNetwork& net, - const std::vector& criterionNodes, - const std::list& learnableNodes, - int npos) - { - // gradient checking - for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) - { - ComputationNodePtr node = (*nodeIter); - - int irow = (int)fmod(rand(), node->FunctionValues().GetNumRows()-1); - int icol = (int)fmod(rand(), node->FunctionValues().GetNumCols()-1); - irow = max(0, irow); - icol = max(0, icol); - - fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str()); - // node->FunctionValues().Print(); - ElemType eOrg = node->FunctionValues()(irow,icol); - - node->UpdateEvalTimeStamp(); - net.ComputeGradient(criterionNodes[npos]); //use only the first criterion. Is - //ElemType mbEvalCri = - criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - ElemType eGradErr = node->GradientValues()(irow, icol); - - ElemType ePos = eOrg + ElemType(EPSILON); - ElemType eNeg = eOrg - ElemType(EPSILON); - - node->FunctionValues()(irow, icol) = ePos; - node->UpdateEvalTimeStamp(); - net.Evaluate(criterionNodes[npos]); - ElemType mbEvalCriPos = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - - node->FunctionValues()(irow, icol) = eNeg; - node->UpdateEvalTimeStamp(); - net.Evaluate(criterionNodes[npos]); - ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - - // back to its orginal parameter value - node->FunctionValues()(irow, icol) = eOrg; - - // check if they are consistent - ElemType eGradNum = (ElemType)((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg)); - ElemType threshold = (ElemType)pow((ElemType)10.0, max((ElemType)0.0, ceil(log10(min(fabs(eGradErr), fabs(eGradNum))))) - (int)m_gradientCheckSigDigit); - ElemType diff = (ElemType)fabs(eGradErr - eGradNum); - bool wrong = (std::isnan(diff) || diff > threshold); - if (wrong) - { - fprintf (stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n", node->NodeName().c_str(), eGradNum, eGradErr); - return false; - } - } - - return true; - } - - void SetOtherInfo(ComputationNetwork& net , IDataReader* /*trainSetDataReader*/, IDataReader* /*validSetDataReader*/, std::map*>& inputMatrices) - { - std::vector criterionNodes = net.FinalCriterionNodes(); - std::vector evaluationNodes = net.EvaluationNodes(); - - //initializing weights and gradient holder - for (size_t i = 0; i < criterionNodes.size(); i++) - { - if (criterionNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax") - { - 
ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) criterionNodes[i]; - crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); - } - } - - for (size_t i=0;iOperationName() == L"ClassBasedCrossEntropyWithSoftmax") - { - ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evaluationNodes[i]; - crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); - } - } - } - - protected: - - floatargvector m_learningRatesPerSample; /// learning rate per sample provided outside - intargvector m_mbSize; - size_t m_epochSize; - size_t m_maxEpochs; - floatargvector m_momentumInputPerMB; - ElemType m_momentumPerMB; - bool m_gradientClippingWithTruncation; - ElemType m_clippingThresholdPerSample; - - wstring m_modelPath; - wstring m_trainCriterionNodeName; - wstring m_evalCriterionNodeName; - - intargvector m_numMiniBatch4LRSearch; - size_t m_numBestSearchEpoch; - - LearningRateSearchAlgorithm m_autoLearnRateSearchType; - - AdaptationRegType m_adaptationRegType; - ElemType m_adaptationRegWeight; - bool m_needRegularization; - - bool m_loadBestModel; - ElemType m_reduceLearnRateIfImproveLessThan; - bool m_continueReduce; - size_t m_learnRateAdjustInterval; //determine after how many epochs the learning rate should be auto adjusted. - ElemType m_increaseLearnRateIfImproveMoreThan; - ElemType m_learnRateIncreaseFactor; - ElemType m_learnRateDecreaseFactor; - - floatargvector m_dropoutRates; - size_t m_maxTempMemSizeInSamplesForCNN; - - int m_traceLevel; - - size_t m_numPrevLearnRates; - - ElemType m_minLearnRate; - - GradientUpdateInfo m_gradType; - RMSPropInfo m_rpi; - - bool m_usePtask; - - bool m_keepCheckPointFiles; - - int m_numMBsToShowResult; - - bool m_doGradientCheck; - ElemType m_gradientCheckSigDigit; - - bool m_validateAfterModelReloading; - }; - template class SGD; - template class SGD; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +#pragma once + +#include "basetypes.h" +#include "ComputationNetwork.h" +#include "ComputationNetworkHelper.h" +#include "SimpleEvaluator.h" +#include "DataReader.h" +#include +#include +#include +#include "fileutil.h" +#include "commandArgUtil.h" +#include +#include +#include "TimerUtility.h" + +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +extern int myRank; +extern int numProcs; + +using namespace std; + +namespace Microsoft { namespace MSR { namespace CNTK { + + template + void DecimateMinibatch(std::map*> &mb) + { + size_t rv = 0; + if ( numProcs > 1 ) for (auto it = mb.begin(); it != mb.end(); ++it) + { + MSR::CNTK::Matrix &mat = *(it->second); + size_t nCols = mat.GetNumCols(); + size_t col_start = (nCols * myRank) / numProcs; + size_t col_end = (nCols*(myRank + 1)) / numProcs; + if (col_end > nCols) col_end = nCols; // this shouldn't happen + if (col_end == col_start) + { + MSR::CNTK::Matrix tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE); + mat.SetValue(tmp); + } + else + { + MSR::CNTK::Matrix tmp = mat.ColumnSlice(col_start, col_end - col_start); + mat.SetValue(tmp); + } + if (0 == rv) + { + rv = mat.GetNumCols(); + } + else + { + if (rv != mat.GetNumCols()) + throw std::logic_error("Uneven number of columns among inputs."); + } + } + } + + enum class LearningRateSearchAlgorithm : int + { + None, + AdjustAfterEpoch, + SearchBeforeEpoch + }; + + enum class AdaptationRegType : int + { + None, + KL + }; + + enum class GradientsUpdateType : int + { + None, + AdaGrad, + RmsProp + }; + + // configuration parameters associated with RMSProp learning algorithm + typedef struct stRMSPropInfo{ + double gamma; + double inc; + double dec; + double max; + double min; + stRMSPropInfo() + { + gamma = 0.99; + inc = 1.2; + dec = 0.75; + max = 10.0; + min = 0.1; + } + }RMSPropInfo; + + typedef struct stGradientUpdateInfo{ + GradientsUpdateType mType; + float mGaussianNoiseInjectStd; + stGradientUpdateInfo() + { + mType = GradientsUpdateType::AdaGrad; + mGaussianNoiseInjectStd = 0.0075f; + } + } GradientUpdateInfo; + + template + class SGD : ComputationNetworkHelper + { + protected: + typedef ComputationNetworkHelper B; + using B::SetMaxTempMemSizeForCNN; using B::SetDropoutRate; using B::UpdateEvalTimeStamps; + typedef ComputationNode* ComputationNodePtr; + typedef ClassBasedCrossEntropyWithSoftmaxNode* ClassBasedCrossEntropyWithSoftmaxNodePtr; + + public: + SGD(const ConfigParameters& configSGD) + { + ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", ""); + m_needToNormalizeLRByParallUtterance = false; + floatargvector learningRatesPerMB = learningRatesPerMBStr; + + ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", ""); + floatargvector learningRatesPerSample = learningRatesPerSampleStr; + + std::string executionEngineValue = configSGD("executionEngine", "synchronous"); + +#ifdef USE_PTASK + // use PTask if we have more than one GPU or the MultiGPU flag is set + bool usePtask = (g_bestGpu != NULL && g_bestGpu->UseMultiple()) || (bool)configSGD("MultiGPU", "false"); +#else + bool usePtask = false; +#endif + // AutoAdjust Parameters + ConfigParameters configAALR (configSGD("AutoAdjust","")); + LearningRateSearchAlgorithm autoAdjustLRType = ParseLearningRateSearchType(configAALR("autoAdjustLR", "None")); + ElemType reduceLearnRateIfImproveLessThan = configAALR("reduceLearnRateIfImproveLessThan", "0"); + bool continueReduce = (bool)configAALR("continueReduce", "false"); + size_t learnRateAdjustInterval = 
(size_t)configAALR("learnRateAdjustInterval", "1"); + ElemType learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618"); + ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");// std::numeric_limits::infinity()); + ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382"); + ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500"); + intargvector numMiniBatch4LRSearch = minibatch4LRSearch; + size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5"); + size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1"); + bool loadBestModel = configAALR("loadBestModel", "true"); + + ConfigArray minibatchSize = configSGD("minibatchSize", "256"); + intargvector mbSize = minibatchSize; + size_t epochSize = configSGD("epochSize", "0"); + + size_t maxEpochs = configSGD("maxEpochs"); + ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); + floatargvector momentumPerMB = momentumPerMBStr; + + wstring modelPath = configSGD("modelPath"); + wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", ""); + wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", ""); + + size_t maxTempMemSizeInSamplesForCNN = configSGD("maxTempMemSizeInSamplesForCNN", "0"); + + int traceLevel = configSGD("traceLevel", "0"); + size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10"); + + bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false"); + + bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true"); + ElemType clippingThresholdPerSample = configSGD("clippingThresholdPerSample", "1#INF"); // std::numeric_limits::infinity()); + + ConfigArray dropoutRatesStr = configSGD("dropoutRate", "0.0"); + floatargvector dropoutRates = dropoutRatesStr; + + GradientUpdateInfo gUpdateInfo; + GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD("gradUpdateType", "None")); + ElemType gaussianNoiseInjecStd = configSGD("gaussianNoiseInjectStd", "0"); + gUpdateInfo.mType = gradUpdateType; + gUpdateInfo.mGaussianNoiseInjectStd = (float)gaussianNoiseInjecStd; + + // extract RMSProp parameters from config, if they exist. Default to reasonable values. + RMSPropInfo rpi; + rpi.dec = (double)configSGD("rms_wgt_dec", "0.75"); + rpi.inc = (double)configSGD("rms_wgt_inc", "1.2"); + rpi.min = (double)configSGD("rms_wgt_min", "0.1"); + rpi.max = (double)configSGD("rms_wgt_max", "10.0"); + rpi.gamma = (double)configSGD("rms_gamma", "0.99"); + + /// for backward support. 
future setup should use gradUpdateType=AdaGrad, instead of + /// useAdagrad=true + bool useAdagrad = configSGD("useAdagrad", "false"); + if (useAdagrad) + { + gradUpdateType = GradientsUpdateType::AdaGrad; + gUpdateInfo.mType = gradUpdateType; + } + + AdaptationRegType adaptationRegType = ParseAdaptationRegType(configSGD("adaptationRegType", "None")); + ElemType adaptationRegWeight = configSGD("adaptationRegWeight", "0"); + + /// gradient check setup + bool doGradientCheck = configSGD("gradientcheck", "false"); + ElemType gradientCheckSigDigit = configSGD("sigFigs", "6"); + + bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true"); + + bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true"); + + Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation, + clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor, + reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates, + loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult, + maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight, + trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, + rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode); + } + + void setMomentum(float momentum) + { + m_momentumPerMB = (ElemType)momentum; + } + + //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample + void Init(const floatargvector& learningRatesPerMB, const floatargvector& learningRatesPerSample, const intargvector& mbSize, + const size_t epochSize, const size_t maxEpochs, + const wstring& modelPath, const floatargvector& momentumPerMB, const bool gradientClippingWithTruncation = true, + const ElemType clippingThresholdPerSample=std::numeric_limits::infinity(), + const LearningRateSearchAlgorithm autoLearnRateSearchType = LearningRateSearchAlgorithm::None, + const ElemType increaseLearnRateIfImproveMoreThan = std::numeric_limits::infinity(), const ElemType learnRateIncreaseFactor = 1.382f, + const ElemType reduceLearnRateIfImproveLessThan=0, const bool continueReduce=false, const ElemType learnRateDecreaseFactor = 0.618f, floatargvector dropoutRates = floatargvector(L"0.0f"), + const bool loadBestModel=true, const intargvector& numMiniBatch4LRSearch=intargvector(L"500"), const size_t numPrevLearnRates = 5, + const size_t numBestSearchEpoch = 1, const int traceLevel = 0, + const size_t numMBsToShowResult = 10, const size_t maxTempMemSizeInSamplesForCNN = 0, + const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None, + const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"", + const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true, + RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed=true) + { + numPrevLearnRates; + m_mbSize=mbSize; + m_epochSize=epochSize; + if (m_epochSize == 0) + { + m_epochSize = requestDataSize; + } + 
m_maxEpochs=maxEpochs; + + m_gradientClippingWithTruncation=gradientClippingWithTruncation; + m_modelPath=modelPath; + m_autoLearnRateSearchType=autoLearnRateSearchType; + m_traceLevel=traceLevel; + m_loadBestModel=loadBestModel; + m_increaseLearnRateIfImproveMoreThan=increaseLearnRateIfImproveMoreThan; + m_learnRateIncreaseFactor=learnRateIncreaseFactor; + m_reduceLearnRateIfImproveLessThan=reduceLearnRateIfImproveLessThan; + m_continueReduce=continueReduce; + m_learnRateAdjustInterval = max(1, learnRateAdjustInterval); //minimum interval is 1 epoch + m_learnRateDecreaseFactor=learnRateDecreaseFactor; + m_clippingThresholdPerSample=abs(clippingThresholdPerSample); + m_numMiniBatch4LRSearch=numMiniBatch4LRSearch; + m_dropoutRates=dropoutRates; + m_numMBsToShowResult=int(numMBsToShowResult); + m_numBestSearchEpoch=numBestSearchEpoch; + m_maxTempMemSizeInSamplesForCNN=maxTempMemSizeInSamplesForCNN; + m_gradType = gradUpdateType; + m_rpi = rpi; + m_usePtask = usePtask; + m_keepCheckPointFiles = keepCheckPointFiles; + + m_adaptationRegType = adaptationRegType; + m_adaptationRegWeight = adaptationRegWeight; + + m_trainCriterionNodeName = trainCriterionNodeName; + m_evalCriterionNodeName = evalCriterionNodeName; + m_useAllDataForPreComputedNode = UsingAllDataForPreComputed; + + for (size_t i=0; i 0 && learningRatesPerMB.size() > 0) + { + throw std::invalid_argument ("You specified both learningRatesPerSample and learningRatesPerMB. Please comment out one of them."); + } + else if (learningRatesPerSample.size() > 0) + { + m_learningRatesPerSample=learningRatesPerSample; + } + else if (learningRatesPerMB.size() > 0) + { + int LRSize = (int)max(learningRatesPerMB.size(), m_mbSize.size()); + m_learningRatesPerSample.resize(LRSize); + for (int i=0; i0) + { + m_momentumInputPerMB=momentumPerMB; + if (m_momentumInputPerMB[0]>=1 || m_momentumInputPerMB[0]<0) + throw std::invalid_argument ("momentumPerMB must be in [0, 1)."); + } + + if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor<1) + { + throw std::invalid_argument ("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1."); + } + + for (size_t i=0; i= 1 || m_dropoutRates[i] < 0) + { + throw std::invalid_argument ("dropoutRate must be >= 0 and < 1."); + } + } + + if (m_adaptationRegWeight > 1 || m_adaptationRegWeight <0) + throw invalid_argument("adaptationRegWeight must be in [0 1]"); + + m_minLearnRate = 1e-9f; + + m_needRegularization = false; + + m_doGradientCheck = doGradientCheck; + m_gradientCheckSigDigit = gradientCheckSigDigit; + m_validateAfterModelReloading = validateAfterModelReloading; + + msra::files::make_intermediate_dirs (m_modelPath); + } + + void Adapt(wstring origModelFileName, wstring refNodeName, IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, const DEVICEID_TYPE deviceID, const bool makeMode = true) + { + if (origModelFileName == L"" || trainSetDataReader == nullptr) + throw std::invalid_argument ("origModel and trainSetDataReader should not be null."); + + int startEpoch = DetermineStartEpoch(makeMode); + if (startEpoch == m_maxEpochs) + { + fprintf(stderr, "Final model exists. No further training is necessary.\n"); + return; + } + + ComputationNetwork net(deviceID); + if (startEpoch >= 0) + { + wstring modelFileName = GetModelNameForEpoch(int(startEpoch)-1); + fprintf(stderr, "Starting from checkpoint. 
Load Network From File %ls.\n", modelFileName.c_str());
+                net.LoadFromFile(modelFileName);
+            }
+            else
+            {
+                fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
+                net.LoadFromFile(origModelFileName);
+            }
+
+            startEpoch = max(startEpoch, 0);
+
+            ComputationNetwork<ElemType> refNet(deviceID);
+            m_needRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
+            if (m_needRegularization)
+            {
+                fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
+                refNet.LoadFromFile(origModelFileName);
+            }
+
+            ComputationNodePtr refNode = nullptr;
+            if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL)
+            {
+                fprintf(stderr, "Checking refNodeName %ls.\n", refNodeName.c_str());
+                if (refNodeName == L"")
+                    throw invalid_argument("refNodeName does not exist and is needed when adaptationRegType is KL.");
+
+                refNode = refNet.GetNodeFromName(refNodeName);
+            }
+
+            TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader);
+        }
+
+        void Train(IComputationNetBuilder<ElemType>* netBuilder, IDataReader<ElemType>* trainSetDataReader, IDataReader<ElemType>* validationSetDataReader, const bool makeMode = true)
+        {
+            if (netBuilder == nullptr || trainSetDataReader == nullptr)
+                throw std::invalid_argument("netBuilder and trainSetDataReader should not be null.\n");
+
+            int startEpoch = DetermineStartEpoch(makeMode);
+            if (startEpoch == m_maxEpochs)
+            {
+                fprintf(stderr, "Final model exists. No further training is necessary.\n");
+                return;
+            }
+
+            wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+            if (startEpoch >= 0)
+                fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+            ComputationNetwork<ElemType>& net =
+                startEpoch < 0 ? netBuilder->BuildNetworkFromDescription() : netBuilder->LoadNetworkFromFile(modelFileName);
+            // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model
+            // strategy should be to run the initializer above on myRank==0, and then broadcast parameters.
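+            // A sketch of one way to do that broadcast (illustrative only, not part of
+            // this change; assumes MPI_SUPPORT, and a learnableNodes list obtained as in
+            // TrainOrAdaptModel; reuses the CopyToArray/SetValue pattern from the
+            // model-averaging code further below):
+            //
+            //     for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+            //     {
+            //         Matrix<ElemType>& mat = (*nodeIter)->FunctionValues();
+            //         ElemType* px = mat.CopyToArray();    // flatten rank-local values to a CPU buffer
+            //         MPI_Bcast(px, (int)mat.GetNumElements(),
+            //                   sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
+            //         mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), px);   // all ranks adopt rank 0's values
+            //         delete[] px;
+            //     }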
+ + startEpoch = max(startEpoch, 0); + m_needRegularization = false; + + TrainOrAdaptModel(startEpoch, net, net, nullptr, trainSetDataReader, validationSetDataReader); + } + + protected: + std::vector GetTrainCriterionNodes(ComputationNetwork& net) + { + fprintf(stderr, "GetTrainCriterionNodes %ls ...\n", m_trainCriterionNodeName.c_str()); + if (!m_trainCriterionNodeName.empty()) + { + std::vector nodes; + ComputationNodePtr node = net.GetNodeFromName(m_trainCriterionNodeName); + net.ValidateNetwork(node); + if (node->FunctionValues().GetNumElements() != 1) + throw invalid_argument("the trainCriterionNodeName specified in the config file is not a valid training criterion node."); + + nodes.push_back(node); + return nodes; + } + else + return net.FinalCriterionNodes(); + } + std::vector GetEvalCriterionNodes(ComputationNetwork& net) + { + fprintf(stderr, "GetEvalCriterionNodes %ls ...\n", m_evalCriterionNodeName.c_str()); + if (!m_evalCriterionNodeName.empty()) + { + std::vector nodes; + ComputationNodePtr node = net.GetNodeFromName(m_evalCriterionNodeName); + net.ValidateNetwork(node); + if (node->FunctionValues().GetNumElements() != 1) + throw invalid_argument("the evalCriterionNodeName specified in the config file is not a valid evaluation criterion node."); + + nodes.push_back(node); + return nodes; + } + else + return net.EvaluationNodes(); + } + + void TrainOrAdaptModel(int startEpoch, ComputationNetwork& net, ComputationNetwork& refNet, ComputationNodePtr refNode, + IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) + { + std::vector & FeatureNodes = net.FeatureNodes(); + std::vector & labelNodes = net.LabelNodes(); + std::vector criterionNodes = GetTrainCriterionNodes(net); + std::vector evaluationNodes = GetEvalCriterionNodes(net); + + std::map*> inputMatrices; + for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); + } + for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); + } + + // special handling of classed based softmax node. Need a better solution to it. + if (criterionNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName() || + evaluationNodes[0]->OperationName() == ClassBasedCrossEntropyWithSoftmaxNode::TypeName()) + { + size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); + int deviceId = FeatureNodes[0]->FunctionValues().GetDeviceId(); + inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); + inputMatrices[L"classinfo"] = new Matrix(vSz, 1, (DEVICEID_TYPE)deviceId); + } + + + //used for KLD regularized adaptation. For all other adaptation techniques use MEL to edit the model and using normal training algorithm + std::vector refFeatureNodes; + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) + { + refFeatureNodes.resize(FeatureNodes.size()); + for (size_t i=0; iNodeName()); //we need to keep this info to handle deletion + refNet.ChangeNode(FeatureNodes[i]->NodeName(), FeatureNodes[i]); + } + + refNet.RebuildNetwork(refNode); + } + + //initializing weights and gradient holder + std::list& learnableNodes = net.LearnableNodes(criterionNodes[0]); //only one criterion so far TODO: support multiple ones? 
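+            // The buffers created next hold the optimizer state: one Matrix<ElemType> per
+            // learnable node, shaped like that node's parameter matrix. Depending on the
+            // update rule chosen in UpdateWeightsS, this is the momentum-smoothed gradient
+            // (plain SGD) or the accumulated per-element statistics (AdaGrad / RmsProp),
+            // and it is exactly what SaveCheckPointInfo/LoadCheckPointInfo serialize.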
+ std::list> smoothedGradients; + + for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) + { + ComputationNodePtr node = (*nodeIter); + smoothedGradients.push_back(Matrix(node->FunctionValues().GetNumRows(), node->FunctionValues().GetNumCols(),net.GetDeviceID())); + } + + ElemType epochCriterion, avgCriterion, prevCriterion; + epochCriterion = avgCriterion = prevCriterion = std::numeric_limits::infinity(); + size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval; + + std::vector epochEvalErrors(evaluationNodes.size(),std::numeric_limits::infinity()); + + std::vector evalNodeNames; + for (size_t i=0;iNodeName()); + + size_t totalSamplesSeen = 0; + ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; + + int m_numPrevLearnRates = 5; //used to control the upper learnining rate in LR search to reduce computation + vector prevLearnRates; + prevLearnRates.resize(m_numPrevLearnRates); + for (int i=0; i::infinity(); + + //precompute mean and invStdDev nodes and save initial model + if (PreCompute(net, trainSetDataReader, FeatureNodes, labelNodes, inputMatrices) || startEpoch == 0) + if (0 == myRank) // only needs to be done by one process + net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1)); + + // first, we need to normalize the effect of nbruttsineachrecurrentiter + if (trainSetDataReader->NumberSlicesInEachRecurrentIter()>1 && m_needToNormalizeLRByParallUtterance) + { + for (auto & x : m_learningRatesPerSample) + { + x /= trainSetDataReader->NumberSlicesInEachRecurrentIter(); + } + } + bool learnRateInitialized = false; + if (startEpoch > 0) + { + learnRateInitialized = LoadCheckPointInfo(startEpoch-1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); + setMomentum(m_momentumInputPerMB[m_momentumInputPerMB.size()-1]); + } + + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch) + throw std::invalid_argument ("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch."); + + unsigned long dropOutSeed = 1; + ElemType prevDropoutRate = 0; + + bool learnRateReduced = false; + + SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN); + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) + SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN); + + // build the PTask graph if they want to use ptask + // NOTE: the graph is currently only for training, so other operations will still use the usual method, + // (i.e rate adjustment and other custom operations still use the non PTask method) + if (m_usePtask) + { + // set the minibatch size to the largest thing we will ever see + int maxMbSize = 0; + for (int val : m_mbSize) + { + maxMbSize = max(val, maxMbSize); + } + net.SetActualMiniBatchSize(maxMbSize); + net.BuildPTaskGraph(); + } + + for (int i = int(startEpoch); i < int(m_maxEpochs); i++) + { + auto t_start_epoch = Timer::MilliSecondElapsed(); + + // set other information to inputMatrices that can contrain information + // used for class-based LM for clustring information + SetOtherInfo(net, trainSetDataReader, validationSetDataReader, inputMatrices); + + //set dropout rate + SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); + + //learning rate adjustment + if 
(m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) + { + learnRatePerSample = m_learningRatesPerSample[i]; + setMomentum(m_momentumInputPerMB[i]); + } + else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) + { + ElemType largestPrevLearnRatePerSample = prevLearnRates[0]; + for (int j = 1; j < m_numPrevLearnRates; j++) + { + largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); + } + + //return a reasonable learning rate based on the initial mbsize + learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, trainSetDataReader, FeatureNodes, + labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample); + + prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; //save per sample learn rate to support changeable mbsize + } + + learnRateInitialized = true; + + if (learnRatePerSample < m_minLearnRate) + { + fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); + if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) + net.SaveToFile(m_modelPath); + break; + } + +#ifdef MPI_SUPPORT + INT32 mySamples = (INT32) +#endif + fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", (int)startEpoch, learnRatePerSample, m_momentumPerMB); + TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, + epochCriterion, epochEvalErrors, totalSamplesSeen); + + auto t_end_epoch = Timer::MilliSecondElapsed(); + ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC); + + fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion); + if (epochEvalErrors.size() == 1) + { + fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); + } + else + { + fprintf(stderr, "EvalErr Per Sample "); + for (size_t j = 0; j < epochEvalErrors.size(); j++) + fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); + fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime); + fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); + for (size_t j = 0; j < epochEvalErrors.size(); j++) + fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]); + } + +#ifdef MPI_SUPPORT + // model reduction and averaging + if (numProcs > 0) + { + ElemType factor; // weight for the parameter of my model + { + // compute total minibatch size + INT32 allSamples = 0; + MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + if (allSamples == 0) allSamples = 1; + + factor = (ElemType)mySamples / (ElemType)allSamples; + } + + for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) + { + ComputationNodePtr node = (*nodeIter); + Microsoft::MSR::CNTK::Matrix &mat = node->FunctionValues(); + + // weight model by relative size of minibatch samples (and number of processors, for 
averaging)
+                        ElemType* px = mat.CopyToArray();
+                        size_t nx = mat.GetNumElements();
+                        transform(px, px + nx, px, [factor](ElemType& val) -> ElemType { return val * factor; });
+
+                        // TODO: Replace default Allreduce with the reduction-shuffle-dance
+                        vector<ElemType> py = vector<ElemType>(nx, ElemType(0));
+                        MPI_Allreduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+                        mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0]));
+                        delete[] px;   // CopyToArray allocates an array with new[], so array delete is required
+                    }
+                }
+#endif
+
+                if (0 == myRank) // only evaluate once, on the master process. TODO: This could be faster by farming out the validation parts
+                    if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
+                    {
+                        SimpleEvaluator<ElemType> evalforvalidation(net);
+                        vector<wstring> cvSetTrainAndEvalNodes;
+                        cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
+                        cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
+
+                        vector<ElemType> vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
+                        fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n",
+                                i + 1, vScore[0], vScore[1]);
+
+                        epochCriterion = vScore[0]; //the first one is the training criterion.
+                    }
+#ifdef MPI_SUPPORT
+                // ensure all processes have the same epochCriterion
+                MPI_Bcast(&epochCriterion, 1, sizeof(epochCriterion) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
+#endif
+
+                bool loadedPrevModel = false;
+                size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
+                if (avgCriterion == std::numeric_limits<ElemType>::infinity())
+                    avgCriterion = epochCriterion;
+                else
+                    avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
+
+                if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+                {
+                    if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                    {
+                        if (m_loadBestModel)
+                        {
+                            net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1), m_validateAfterModelReloading);
+                            net.ResetEvalTimeStamp();
+                            LoadCheckPointInfo(i - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
+                            fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
+                            loadedPrevModel = true;
+                        }
+                    }
+
+                    if (m_continueReduce)
+                    {
+                        if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                        {
+                            if (learnRateReduced == false)
+                            {
+                                learnRateReduced = true;
+                            }
+                            else
+                            {
+                                if (myRank == 0)
+                                    net.SaveToFile(GetModelNameForEpoch(i, true));
+                                fprintf(stderr, "Finished training and saved final model\n\n");
+                                break;
+                            }
+                        }
+                        if (learnRateReduced)
+                        {
+                            learnRatePerSample *= m_learnRateDecreaseFactor;
+                            fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                        }
+                    }
+                    else
+                    {
+                        if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                        {
+                            learnRatePerSample *= m_learnRateDecreaseFactor;
+                            fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                        }
+                        else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && prevCriterion != std::numeric_limits<ElemType>::infinity())
+                        {
+                            learnRatePerSample *=
m_learnRateIncreaseFactor; + fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample); + } + } + } + + if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) //not loading previous values then set them + { + prevCriterion = avgCriterion; + epochsNotCountedInAvgCriterion = 0; + } + + //persist model and check-point info + if (0 == myRank) + { + net.SaveToFile(GetModelNameForEpoch(i)); + SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); + if (!m_keepCheckPointFiles) + _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); //delete previous checkpiont file to save space + } + + if (learnRatePerSample < 1e-12) + fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample); + } + + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) //since we linked feature nodes. we need to remove it from the deletion + { + for (size_t i=0; iNodeName(), refFeatureNodes[i]); //note we need to handle deletion carefully + } + } + + if (inputMatrices[L"classinfo"]) + { + delete inputMatrices[L"classinfo"]; + inputMatrices.erase(L"classinfo"); + } + if (inputMatrices[L"idx2cls"]) + { + delete inputMatrices[L"idx2cls"]; + inputMatrices.erase(L"idx2cls"); + } + + } + + protected: + + //return true if precomputation is executed. + bool PreCompute(ComputationNetwork& net, + IDataReader* trainSetDataReader, + std::vector& FeatureNodes, + std::vector& labelNodes, + std::map*>& inputMatrices) + { + std::list nodes = net.GetNodesRequirePreComputation(); + + if (nodes.size() == 0) + { + fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step\n"); + return false; + } + + fprintf(stderr, "Found %lu PreCompute nodes\n", nodes.size()); + for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) + { + PreComputedNode* node = static_cast*> (*nodeIter); + fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str()); + } + + //compute + //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize); + // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch + // [1/12/2015 erw] to support large dataset, we usually paritition whole dataset into several epoches, so we need to use all the data to do precomputing + if (m_useAllDataForPreComputedNode) + trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); // using all the data + else + trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // using all the data + + while (trainSetDataReader->GetMinibatch(inputMatrices)) + { + UpdateEvalTimeStamps(FeatureNodes); + UpdateEvalTimeStamps(labelNodes); + + size_t actualMBSize = net.GetActualMBSize(); + net.SetActualMiniBatchSize(actualMBSize); + net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd); + + for (auto nodeIter=nodes.begin(); nodeIter != nodes.end(); nodeIter++) + { + net.Evaluate( *nodeIter); + } + } + + //mark done + for (auto nodeIter=nodes.begin(); nodeIter != nodes.end(); nodeIter++) + { + PreComputedNode* node = static_cast*> (*nodeIter); + node->MarkComputed(true); + } + + return true; + } + + //return a reasonable initial learning rate based on the initial mbsize + ElemType SearchLearnRateBeforeEpoch(ComputationNetwork& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, + const int epochNumber, const ElemType 
curLearnRate, + IDataReader* trainSetDataReader, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, const bool /*learnRateInitialized*/, const ElemType largestPrevLearnRatePerSample) + { + ElemType epochCriterion = std::numeric_limits::infinity(), prevCriterion = std::numeric_limits::infinity(); + vector epochEvalErrors(evaluationNodes.size(),std::numeric_limits::infinity()); + //ElemType epochEvalError = std::numeric_limits::infinity(); + size_t totalSamplesSeen = 0; + ElemType bestLearnRatePerSample = curLearnRate; + + size_t epochSize = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; + if (m_epochSize != requestDataSize) + { + epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision + } + + ElemType baseCriterion; + + ElemType minLearnRate = m_minLearnRate * 0.3f; + ElemType learnRatePerSample = 1.0f / 8.0f / 0.618f /sqrt((ElemType)m_mbSize[epochNumber]); + + if (largestPrevLearnRatePerSample != std::numeric_limits::infinity()) + learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f; //largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety + + int baseModelEpoch = epochNumber-1; + net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading); + net.ResetEvalTimeStamp(); + + ElemType learnRate =learnRatePerSample; + LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, smoothedGradients, prevCriterion); + + //if model is not changed this is what we will get + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, 0, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, baseCriterion, epochEvalErrors, totalSamplesSeen); + + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) + { + if (prevCriterion == std::numeric_limits::infinity()) + prevCriterion = baseCriterion; + ElemType ratio = 0.3f; + if (m_epochSize != requestDataSize) + { + ratio = pow(((ElemType)epochSize) / m_epochSize, 1.0f/2); + } + baseCriterion = max(ratio * prevCriterion + (1-ratio) * baseCriterion, baseCriterion); + } + + do + { + learnRatePerSample *= 0.618f; + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); + + } while (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate); + + bestLearnRatePerSample = learnRatePerSample; + + if (epochNumber < m_numBestSearchEpoch) //grid search for the first m_numBestSearchEpoch epochs + { + ElemType leftLearnRatePerSample = 0.01f / m_mbSize[epochNumber], rightLearnRatePerSample = learnRatePerSample; + ElemType leftCriterion, rightCriterion = epochCriterion; + + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); + + while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) + { + if (rightCriterion > leftCriterion) + { + 
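+                        // 0.618 is roughly 1/golden-ratio: whichever endpoint of the bracket
+                        // [leftLearnRatePerSample, rightLearnRatePerSample] currently has the
+                        // worse criterion is pulled inward by a golden-section-style step,
+                        // and the enclosing while loop stops once the endpoints are within a
+                        // factor of 1.2 of each other.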
rightLearnRatePerSample *= 0.618f; + + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, rightLearnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, rightCriterion, epochEvalErrors, totalSamplesSeen); + } + else + { + leftLearnRatePerSample /= 0.618f; + + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, leftLearnRatePerSample, + FeatureNodes,labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, + smoothedGradients, leftCriterion, epochEvalErrors, totalSamplesSeen); + } + } + + bestLearnRatePerSample = (leftCriterion < rightCriterion)? leftLearnRatePerSample : rightLearnRatePerSample; + } + + fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n", epochNumber+1, bestLearnRatePerSample, baseCriterion); + + return bestLearnRatePerSample; + } + + void TrainOneMiniEpochAndReloadModel(ComputationNetwork& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, + const int epochNumber,const size_t epochSize, IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) + { + TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize, trainSetDataReader, learnRatePerSample,FeatureNodes,labelNodes, + criterionNodes,evaluationNodes,inputMatrices, learnableNodes,smoothedGradients, + epochCriterion, epochEvalErrors, totalSamplesSeen); + fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ", epochCriterion); + if (epochEvalErrors.size()==1) + fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n", epochEvalErrors[0], learnRatePerSample); + else + { + fprintf(stderr, "EvalErr Per Sample "); + for (size_t i=0; i& net, ComputationNetwork& refNet, const ComputationNodePtr refNode, + const int epochNumber, const size_t epochSize, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + ElemType& epochCriterion, std::vector& epochEvalErrors, size_t& totalSamplesSeen) + { + ElemType readTimeInMBs = 0, ComputeTimeInMBs = 0, epochCriterionLastMBs = 0; + int numSamplesLastMBs = 0; + std::vector epochEvalErrorsLastMBs(epochEvalErrors.size(),0); + PTaskGraphBuilder* ptaskGraphBuilder = NULL; + + unsigned long long startReadMBTime = 0, startComputeMBTime=0; + unsigned long long endReadMBTime = 0, endComputeMBTime = 0; + + //initialize statistics + size_t totalEpochSamples = 0; + + int numMBsRun = 0; + bool beginEpoch = true; + + size_t numEvalNodes = epochEvalErrors.size(); + + // NOTE: the following two local matrices are not used in PTask path + Matrix localEpochCriterion(1,1,net.GetDeviceID()); //assume only one training criterion node for each epoch + Matrix localEpochEvalErrors(1,numEvalNodes,net.GetDeviceID()); + + localEpochCriterion.SetValue(0); + localEpochEvalErrors.SetValue(0); + + if (m_usePtask) + { + epochCriterion = 
ElemType(0.0); + epochEvalErrors.assign(numEvalNodes, ElemType(0.0)); + } + + trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); + + // build the PTask graph if they want to use ptask + // NOTE: the graph is currently only for training, so other operations will still use the usual method, + // (i.e rate adjustment, regularization and other custom operations still use the non PTask method) + if (m_usePtask) + { + ptaskGraphBuilder = net.GetPTaskGraphBuilder(); + ptaskGraphBuilder->UpdateParameters(this, learnRatePerSample, m_mbSize[epochNumber]); + ptaskGraphBuilder->StartPTaskGraph(); + + // currently CNTK likes to keep things on the GPU, and PTask expects things to be on the CPU, so tell CNTK to keep data on the CPU + for (std::pair*> inpair : inputMatrices) + { + Matrix* mat = inpair.second; + mat->SetPreferredDeviceId(CPUDEVICE); + mat->TransferFromDeviceToDevice(mat->GetDeviceId(), CPUDEVICE, true); + } + } + + startReadMBTime=Timer::MilliSecondElapsed(); + while (trainSetDataReader->GetMinibatch(inputMatrices)) + { +#ifdef MPI_SUPPORT + DecimateMinibatch(inputMatrices); +#endif + endReadMBTime=Timer::MilliSecondElapsed(); + startComputeMBTime=Timer::MilliSecondElapsed(); + + UpdateEvalTimeStamps(FeatureNodes); + UpdateEvalTimeStamps(labelNodes); + + size_t actualMBSize = net.GetActualMBSize(); + if (0 == actualMBSize) + continue; + + net.SetActualMiniBatchSize(actualMBSize); + net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + trainSetDataReader->SetSentenceEndInBatch(net.m_sentenceEnd); + +#ifndef EVALDLL + if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false) + { + throw std::logic_error("cannot pass gradient checker"); + } +#endif + if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) //TODO: currently only support one node regularization + { + refNet.SetActualMiniBatchSize(actualMBSize); + refNet.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + refNet.Evaluate(refNode); + Matrix::ScaleAndAdd(m_adaptationRegWeight, refNode->FunctionValues(), 1-m_adaptationRegWeight, labelNodes[0]->FunctionValues()); + } + + if (m_usePtask) + { + // Pushing data in the graph starts things going + bool endOfEpoch = trainSetDataReader->DataEnd(endDataEpoch); + CONTROLSIGNAL signal = beginEpoch?DBCTLC_BOF:DBCTLC_NONE; + if (endOfEpoch) + signal |= DBCTLC_EOF; + + ptaskGraphBuilder->PushData(inputMatrices, signal); + ptaskGraphBuilder->PushActualMBSize(learnableNodes, net.GetActualMBSize(), signal); + beginEpoch = false; // clear this out after first epoch + + // pull the values from the graph for the totals + epochCriterion += ptaskGraphBuilder->GetValue(criterionNodes[0]); + for (size_t i=0; iGetValue(evaluationNodes[i]); + } + + // NOTE: update model parameters is part of the graph, so nothing to do here + } + else + { + if (learnRatePerSample > m_minLearnRate * 0.01) //only compute gradient when learning rate is large enough + net.ComputeGradient(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? + else + net.Evaluate(criterionNodes[0]); //use only the first criterion. Is there any possibility to use more? 
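+                    // The minibatch criterion is a 1x1 matrix on the compute device; the
+                    // AddElementToElement calls below fold it (and each eval criterion) into
+                    // the on-device running totals, so the per-epoch sums only cross back to
+                    // the CPU when Get00Element() is called for logging.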
+ + Matrix::AddElementToElement(criterionNodes[0]->FunctionValues(), 0, 0, localEpochCriterion, 0, 0); + + std::vectormbEvalErrors(numEvalNodes,0); + for (size_t i=0; i::AddElementToElement(evaluationNodes[i]->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i); + } + + //update model parameters + if (learnRatePerSample > m_minLearnRate * 0.01) + { + auto smoothedGradientIter=smoothedGradients.begin(); + for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++) + { + ComputationNodePtr node = (*nodeIter); + Matrix& smoothedGradient = (*smoothedGradientIter); + + UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber]); + } + } + } + + + endComputeMBTime=Timer::MilliSecondElapsed(); + numMBsRun ++; + if (m_traceLevel > 0) + { + ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(MS_PER_SEC); + ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/MS_PER_SEC; + + readTimeInMBs += MBReadTime; + ComputeTimeInMBs += MBComputeTime; + numSamplesLastMBs += int(actualMBSize); + + if (numMBsRun % m_numMBsToShowResult == 0) + { + if (!m_usePtask) + { // get the epoch Values updated, in PTask don't use the loclEpoch* temporary matrices + epochCriterion = localEpochCriterion.Get00Element(); + for (size_t i=0; i< numEvalNodes; i++) + epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); + } + + fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ",epochNumber+1, numMBsRun-m_numMBsToShowResult+1, numMBsRun, numSamplesLastMBs, + (epochCriterion-epochCriterionLastMBs)/numSamplesLastMBs); + for (size_t i=0; i= epochSize) + break; + + /// call DataEnd function + /// DataEnd does reader specific process if sentence ending is reached + trainSetDataReader->DataEnd(endDataSentence); + + } + + if (m_usePtask) + { + // when the epoch is complete, we need to transfer all the values back to the LearnableNodes, which will be saved off as the model + std::list learnableNodes = net.LearnableNodes(criterionNodes[0]); + for (ComputationNodePtr node : learnableNodes) + { + ptaskGraphBuilder->GetValue(node, node->FunctionValues()); + } + epochCriterion /= float(totalEpochSamples); + for (size_t i=0; i< numEvalNodes; i++) + { + epochEvalErrors[i] /= float(totalEpochSamples); + } + } + else + { + localEpochCriterion /= float(totalEpochSamples); + localEpochEvalErrors /= float(totalEpochSamples); + + epochCriterion = localEpochCriterion.Get00Element(); + for (size_t i=0; i< numEvalNodes; i++) + { + epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i); + } + } + return totalEpochSamples; + } +public: + // UpdateWeightsS - static version of UpdateWeights() + static void UpdateWeightsS(const SGD* sgd, Matrix& functionValues, Matrix& gradientValues, Matrix& smoothedGradient, const ElemType learnRatePerSample, size_t actualMBSize, const size_t expectedMBSize) + { +#if DUMPOUTPUT + fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",learnRatePerSample, actualMBSize, expectedMBSize); + fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f, sgd->MomentumPerMB()=%0.8f\n",sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd(), sgd->MomentumPerMB()); + gradientValues.Print("Gradient Input"); + smoothedGradient.Print("Smoothed Gradient Input"); +#endif + + // make actualMBSize is a valid value + assert(actualMBSize > 0); + + //clipping gradients to prevent outliers + sgd->ClipGradient(gradientValues, 
actualMBSize);
+
+            GradientsUpdateType adpType = sgd->GradUpdateType();
+            ElemType noiseStd = sgd->GradientUpdateNoiseStd();
+            Matrix<ElemType> sgdUpdateNoise((DEVICEID_TYPE)functionValues.GetDeviceId());
+            if (noiseStd > 0)
+            {
+                sgdUpdateNoise.SetValue(gradientValues);  /// get the gradient structure since gradient is sparse
+                sgdUpdateNoise.SetGaussianRandomValue(0, noiseStd);  // reset its value to random
+            }
+
+            if (adpType == GradientsUpdateType::None)
+            {
+                ElemType momentum = sgd->MomentumPerMB();
+                if (actualMBSize < expectedMBSize && momentum > 0.0000001f)  //scale momentum for the smaller minibatch: momentum^(actualMBSize/expectedMBSize)
+                {
+                    momentum = (ElemType) exp (log(momentum)/expectedMBSize * actualMBSize);
+                }
+                smoothedGradient.NormalGrad(gradientValues, functionValues, learnRatePerSample, momentum);
+            }
+            if (adpType == GradientsUpdateType::AdaGrad)
+            {
+                smoothedGradient.Adagrad(gradientValues);
+                Matrix<ElemType>::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues);
+            }
+            if (adpType == GradientsUpdateType::RmsProp)
+            {
+                // include L2 regularizer
+                Matrix<ElemType>::ScaleAndAdd((ElemType)0.001, functionValues, gradientValues);
+                smoothedGradient.RmsProp(gradientValues, (ElemType)sgd->m_rpi.gamma, (ElemType)sgd->m_rpi.inc, (ElemType)sgd->m_rpi.max, (ElemType)sgd->m_rpi.dec, (ElemType)sgd->m_rpi.min);
+                Matrix<ElemType>::ScaleAndAdd(-learnRatePerSample, gradientValues, functionValues);
+            }
+
+            if (noiseStd > 0)
+            {
+                Matrix<ElemType>::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues);
+            }
+#if DUMPOUTPUT
+            functionValues.Print("Parameter Update");
+#endif
+        }
+protected:
+        // UpdateWeights - update the weights of a single node
+        void UpdateWeights(const ComputationNodePtr node, Matrix<ElemType>& smoothedGradient, const ElemType learnRatePerSample, const size_t actualMBSize, const size_t expectedMBSize) const
+        {
+#if DUMPOUTPUT
+            fprintf(stderr, "Update_%ls\n",node->NodeName().c_str());
+#endif
+            UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(), smoothedGradient, learnRatePerSample, actualMBSize, expectedMBSize);
+            node->UpdateEvalTimeStamp();
+        }
+
+        void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
+        {
+            if (m_clippingThresholdPerSample != std::numeric_limits<ElemType>::infinity())
+            {
+                ElemType maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize;
+                if (m_gradientClippingWithTruncation)
+                {
+                    gradient.InplaceTruncate(maxGradientPerMB);
+                }
+                else //norm2 normalized
+                {
+                    ElemType gradientNorm = gradient.FrobeniusNorm();
+                    if (gradientNorm > maxGradientPerMB)
+                    {
+                        ElemType normFactor = maxGradientPerMB / gradientNorm;
+                        gradient *= normFactor;
+                    }
+                }
+            }
+        }
+
+        void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample,
+                                const std::list<Matrix<ElemType>>& smoothedGradients, const ElemType prevCriterion)
+        {
+            wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
+
+            File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
+
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
+            fstream << totalSamplesSeen << learnRatePerSample << prevCriterion;
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
+
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
+
+            for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+            {
+                const Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+                fstream << smoothedGradient;
+            }
+
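+            // Layout of the checkpoint written here (inferred from the markers above and
+            // the matching reads in LoadCheckPointInfo below):
+            //   BCKP
+            //     BLearnRate  totalSamplesSeen learnRatePerSample prevCriterion  ELearnRate
+            //     BGradient   one serialized Matrix<ElemType> per smoothed gradient  EGradient
+            //   ECKP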
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
+
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
+        }
+
+        bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, ElemType& learnRatePerSample,
+                                std::list<Matrix<ElemType>>& smoothedGradients, ElemType& prevCriterion)
+        {
+            wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
+            if (!fexists(checkPointFileName.c_str()))
+            {
+                fprintf(stderr, "Warning: checkpoint file is missing. Learning parameters will be initialized from scratch.\n");
+                return false;
+            }
+
+            File fstream(checkPointFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
+            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
+
+            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
+            fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion;
+            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
+
+            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
+
+            for (auto smoothedGradientIter=smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+            {
+                Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+                fstream >> smoothedGradient;
+            }
+            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
+
+            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP");
+
+            return true;
+        }
+
+        wstring GetCheckPointFileNameForEpoch(const int epoch)
+        {
+            return GetModelNameForEpoch(epoch) + L".ckp";
+        }
+
+        wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false)
+        {
+            int epoch1Base = epoch + 1;
+            if (epoch1Base == m_maxEpochs || bLastModel)
+                return m_modelPath;
+            else
+                return msra::strfun::wstrprintf(L"%s.%d", m_modelPath.c_str(), (int) epoch1Base);
+        }
+
+        //return -1 if nothing exists
+        int DetermineStartEpoch(const bool makeMode)
+        {
+            if (!makeMode)
+                return -1;  //always start from scratch
+
+            int firstEpoch = -1;
+
+            wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs)-1);
+            for (int e = int(m_maxEpochs)-1; e >= -1; e--)
+            {
+                const wstring prevEpochFile = GetModelNameForEpoch(e-1);
+
+                if (msra::files::fuptodate(curEpochFile, prevEpochFile, false))
+                {
+                    firstEpoch = int(e)+1;
+                    break;
+                }
+                else
+                    curEpochFile = prevEpochFile;
+            }
+
+            return firstEpoch;
+        }
+
+        AdaptationRegType ParseAdaptationRegType(wstring s)
+        {
+            msra::strfun::tolower_ascii(s);
+            if (s == L"" || s == L"none")
+                return AdaptationRegType::None;
+            else if (s == L"kl" || s == L"klreg")
+                return AdaptationRegType::KL;
+            else
+                throw std::invalid_argument(
+                    "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are "
+                    "(None | KL)");
+        }
+
+        GradientsUpdateType ParseGradUpdateType(wstring s)
+        {
+            msra::strfun::tolower_ascii(s);
+            if (s == L"" || s == L"none")
+                return GradientsUpdateType::None;
+            else if (s == L"adagrad")
+                return GradientsUpdateType::AdaGrad;
+            else if (s == L"rmsprop")
+                return GradientsUpdateType::RmsProp;
+            else
+                throw std::invalid_argument(
+                    "ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are "
+                    "(None | AdaGrad | RmsProp)");
+        }
+
+        LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)
+        {
+            msra::strfun::tolower_ascii(s);
+            if (s == L"false" || s == L"none")
+                return LearningRateSearchAlgorithm::None;
+            else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before")
+                return LearningRateSearchAlgorithm::SearchBeforeEpoch;
+            else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after")
+                return LearningRateSearchAlgorithm::AdjustAfterEpoch;
+            else
+                throw std::invalid_argument(
+                    "autoAdjustLR: Invalid learning rate search type. Valid values are "
+                    "(None | SearchBeforeEpoch | AdjustAfterEpoch)");
+        }
+
+        GradientsUpdateType GradUpdateType() const {return m_gradType.mType;}
+        ElemType GradientUpdateNoiseStd() const {return m_gradType.mGaussianNoiseInjectStd;}
+        ElemType MomentumPerMB() const {return m_momentumPerMB;}
+
+    public:
+        #define EPSILON 1e-5
+
+        bool GradientCheck(
+            ComputationNetwork<ElemType>& net,
+            const std::vector<ComputationNodePtr>& criterionNodes,
+            const std::list<ComputationNodePtr>& learnableNodes,
+            int npos)
+        {
+            // gradient checking
+            for (auto nodeIter=learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+            {
+                ComputationNodePtr node = (*nodeIter);
+
+                int irow = (int)fmod(rand(), node->FunctionValues().GetNumRows()-1);
+                int icol = (int)fmod(rand(), node->FunctionValues().GetNumCols()-1);
+                irow = max(0, irow);
+                icol = max(0, icol);
+
+                fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
+                // node->FunctionValues().Print();
+                ElemType eOrg = node->FunctionValues()(irow,icol);
+
+                node->UpdateEvalTimeStamp();
+                net.ComputeGradient(criterionNodes[npos]);  //use only the first criterion. Is there any possibility to use more?
+                //ElemType mbEvalCri =
+                criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
+                ElemType eGradErr = node->GradientValues()(irow, icol);
+
+                ElemType ePos = eOrg + ElemType(EPSILON);
+                ElemType eNeg = eOrg - ElemType(EPSILON);
+
+                node->FunctionValues()(irow, icol) = ePos;
+                node->UpdateEvalTimeStamp();
+                net.Evaluate(criterionNodes[npos]);
+                ElemType mbEvalCriPos = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
+
+                node->FunctionValues()(irow, icol) = eNeg;
+                node->UpdateEvalTimeStamp();
+                net.Evaluate(criterionNodes[npos]);
+                ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar
+
+                // back to its original parameter value
+                node->FunctionValues()(irow, icol) = eOrg;
+
+                // check if they are consistent
+                ElemType eGradNum = (ElemType)((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg));
+                ElemType threshold = (ElemType)pow((ElemType)10.0, max((ElemType)0.0, ceil(log10(min(fabs(eGradErr), fabs(eGradNum))))) - (int)m_gradientCheckSigDigit);
+                ElemType diff = (ElemType)fabs(eGradErr - eGradNum);
+                bool wrong = (std::isnan(diff) || diff > threshold);
+                if (wrong)
+                {
+                    fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n", node->NodeName().c_str(), eGradNum, eGradErr);
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        void SetOtherInfo(ComputationNetwork<ElemType>& net, IDataReader<ElemType>* /*trainSetDataReader*/, IDataReader<ElemType>* /*validSetDataReader*/, std::map<std::wstring, Matrix<ElemType>*>& inputMatrices)
+        {
+            std::vector<ComputationNodePtr> criterionNodes = net.FinalCriterionNodes();
+            std::vector<ComputationNodePtr> evaluationNodes = net.EvaluationNodes();
+
+            //initializing weights and gradient holder
+            for (size_t i = 0; i < criterionNodes.size(); i++)
+            {
+                if (criterionNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax")
+                {
+                    ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) criterionNodes[i];
+                    crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]);
+                }
+            }
+
+            for (size_t i=0; i<evaluationNodes.size(); i++)
+            {
+                if (evaluationNodes[i]->OperationName() == L"ClassBasedCrossEntropyWithSoftmax")
+                {
+                    ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evaluationNodes[i];
+                    crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]);
+                }
+            }
+        }
+
+    protected:
+
+        floatargvector m_learningRatesPerSample;  /// learning rate per sample provided outside
+        bool m_needToNormalizeLRByParallUtterance;  // only true when the user specifies LearningRatePerMB and the number of parallel utterances in the reader > 1
+        intargvector m_mbSize;
+        size_t m_epochSize;
+        size_t m_maxEpochs;
+        floatargvector m_momentumInputPerMB;
+        ElemType m_momentumPerMB;
+        bool m_gradientClippingWithTruncation;
+        ElemType m_clippingThresholdPerSample;
+
+        wstring m_modelPath;
+        wstring m_trainCriterionNodeName;
+        wstring m_evalCriterionNodeName;
+
+        intargvector m_numMiniBatch4LRSearch;
+        size_t m_numBestSearchEpoch;
+
+        LearningRateSearchAlgorithm m_autoLearnRateSearchType;
+
+        AdaptationRegType m_adaptationRegType;
+        ElemType m_adaptationRegWeight;
+        bool m_needRegularization;
+
+        bool m_loadBestModel;
+        ElemType m_reduceLearnRateIfImproveLessThan;
+        bool m_continueReduce;
+        size_t m_learnRateAdjustInterval;  //determines after how many epochs the learning rate should be auto adjusted
+        ElemType m_increaseLearnRateIfImproveMoreThan;
+        ElemType m_learnRateIncreaseFactor;
+        ElemType m_learnRateDecreaseFactor;
+
+        floatargvector m_dropoutRates;
+        size_t m_maxTempMemSizeInSamplesForCNN;
+
+        int m_traceLevel;
+
+        size_t m_numPrevLearnRates;
+
+        ElemType m_minLearnRate;
+
+        GradientUpdateInfo m_gradType;
+        RMSPropInfo m_rpi;
+
+        bool m_usePtask;
+
+        bool m_keepCheckPointFiles;
+
+        int m_numMBsToShowResult;
+
+        bool m_doGradientCheck;
+        ElemType m_gradientCheckSigDigit;
+
+        bool m_validateAfterModelReloading;
+
+        bool m_useAllDataForPreComputedNode;
+    };
+    template class SGD<float>;
+    template class SGD<double>;
+
+}}}
diff --git a/MachineLearning/cn/SimpleEvaluator.h b/MachineLearning/cn/SimpleEvaluator.h
index 6c53a09b9..977cefde6 100644
--- a/MachineLearning/cn/SimpleEvaluator.h
+++ b/MachineLearning/cn/SimpleEvaluator.h
@@ -1,350 +1,349 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// -#pragma once - -#include "ComputationNetwork.h" -#include "ComputationNetworkHelper.h" -#include "DataReader.h" -#include -#include -#include -#include "basetypes.h" -#include "fileutil.h" -#include "commandArgUtil.h" -#include - -using namespace std; - -namespace Microsoft { namespace MSR { namespace CNTK { - - template - class SimpleEvaluator : ComputationNetworkHelper - { - typedef ComputationNetworkHelper B; - using B::UpdateEvalTimeStamps; - protected: - typedef ComputationNode* ComputationNodePtr; - typedef ClassBasedCrossEntropyWithSoftmaxNode* ClassBasedCrossEntropyWithSoftmaxNodePtr; - - public: - - SimpleEvaluator(ComputationNetwork& net, const size_t numMBsToShowResult=100, const int traceLevel=0) - : m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel) - { - } - - //returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes) - vector Evaluate(IDataReader& dataReader, const vector& evalNodeNames, const size_t mbSize, const size_t testSize=requestDataSize) - { - //specify evaluation nodes - std::vector evalNodes; - - if (evalNodeNames.size() == 0) - { - fprintf (stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); - if (m_net.EvaluationNodes().size() == 0 && m_net.FinalCriterionNodes().size() == 0) - throw std::logic_error("There is no default evalnodes or training criterion node specified in the network."); - - for (int i=0; i< m_net.EvaluationNodes().size(); i++) - evalNodes.push_back(m_net.EvaluationNodes()[i]); - - for (int i=0; i< m_net.FinalCriterionNodes().size(); i++) - evalNodes.push_back(m_net.FinalCriterionNodes()[i]); - } - else - { - for (int i=0; iFunctionValues().GetNumElements() == 1) - { - throw std::logic_error("The nodes passed to SimpleEvaluator::Evaluate function must be either eval or training criterion nodes (which evalues to 1x1 value)."); - } - evalNodes.push_back(node); - } - } - - //initialize eval results - std::vector evalResults; - for (int i=0; i< evalNodes.size(); i++) - { - evalResults.push_back((ElemType)0); - evalNodes[i]->Reset(); - } - - //prepare features and labels - std::vector & FeatureNodes = m_net.FeatureNodes(); - std::vector & labelNodes = m_net.LabelNodes(); - - std::map*> inputMatrices; - for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); - } - for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); - } - - //evaluate through minibatches - size_t totalEpochSamples = 0; - size_t numMBsRun = 0; - size_t actualMBSize = 0; - size_t numSamplesLastMBs = 0; - size_t lastMBsRun = 0; //MBs run before this display - - std::vector evalResultsLastMBs; - for (int i=0; i< evalResults.size(); i++) - evalResultsLastMBs.push_back((ElemType)0); - - dataReader.StartMinibatchLoop(mbSize, 0, testSize); - dataReader.SetNbrSlicesEachRecurrentIter(1); - - for (int i=0; iOperationName() == L"ClassBasedCrossEntropyWithSoftmax") - { - size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); - if(inputMatrices.find(L"classinfo") == inputMatrices.end()) - { - inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, m_net.GetDeviceID()); - inputMatrices[L"classinfo"] = new Matrix(vSz, 1, m_net.GetDeviceID()); - } - ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evalNodes[i]; - crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); - } - } - - while (dataReader.GetMinibatch(inputMatrices)) - { - 
UpdateEvalTimeStamps(FeatureNodes); - UpdateEvalTimeStamps(labelNodes); - - actualMBSize = m_net.GetActualMBSize(); - m_net.SetActualMiniBatchSize(actualMBSize); - m_net.SetActualNbrSlicesInEachRecIter(dataReader.NumberSlicesInEachRecurrentIter()); - dataReader.SetSentenceEndInBatch(m_net.m_sentenceEnd); - - for (int i=0; iFunctionValues().Get00Element(); //criterionNode should be a scalar - } - - totalEpochSamples += actualMBSize; - numMBsRun++; - - if (m_traceLevel > 0) - { - numSamplesLastMBs += actualMBSize; - - if (numMBsRun % m_numMBsToShowResult == 0) - { - DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); - - for (int i=0; i 0 && numSamplesLastMBs > 0) - { - DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); - } - - //final statistics - for (int i=0; i& dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize) - { - - std::vector FeatureNodes = m_net.FeatureNodes(); - std::vector labelNodes = m_net.LabelNodes(); - std::vector criterionNodes = m_net.FinalCriterionNodes(); - std::vector evaluationNodes = m_net.EvaluationNodes(); - - if (criterionNodes.size()==0) - { - throw std::runtime_error("No CrossEntropyWithSoftmax node found\n"); - } - if (evaluationNodes.size()==0) - { - throw std::runtime_error("No Evaluation node found\n"); - } - - std::map*> inputMatrices; - for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); - } - for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); - } - inputMatrices[L"numberobs"] = new Matrix(1,1, m_net.GetDeviceID()); - - dataReader.StartMinibatchLoop(mbSize, 0, testSize); - - ElemType epochEvalError = 0; - ElemType epochCrossEntropy = 0; - size_t totalEpochSamples = 0; - ElemType prevEpochEvalError = 0; - ElemType prevEpochCrossEntropy = 0; - size_t prevTotalEpochSamples = 0; - size_t prevStart = 1; - size_t numSamples = 0; - ElemType crossEntropy = 0; - ElemType evalError = 0; - - ofstream outputStream; - if (output) - { -#ifdef _MSC_VER - outputStream.open(output); -#else - outputStream.open(charpath(output)); // GCC does not implement wide-char pathnames here -#endif - } - - size_t numMBsRun = 0; - size_t actualMBSize = 0; - while (dataReader.GetMinibatch(inputMatrices)) - { - size_t nbrSamples = (size_t)(*inputMatrices[L"numberobs"])(0, 0); - actualMBSize = nbrSamples; - - for (int npos = 0; npos < nbrSamples ; npos++) - { - FeatureNodes[npos]->UpdateEvalTimeStamp(); - labelNodes[npos]->UpdateEvalTimeStamp(); - - m_net.Evaluate(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more? 
- - m_net.Evaluate(evaluationNodes[npos]); - - ElemType mbCrossEntropy = criterionNodes[npos]->FunctionValues().Get00Element(); // criterionNode should be a scalar - epochCrossEntropy += mbCrossEntropy; - - ElemType mbEvalError = evaluationNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar - - epochEvalError += mbEvalError; - } - - totalEpochSamples += actualMBSize; - - if (outputStream.is_open()) - { - //TODO: add support to dump multiple outputs - ComputationNodePtr outputNode = m_net.OutputNodes()[0]; - foreach_column(j, outputNode->FunctionValues()) - { - foreach_row(i,outputNode->FunctionValues()) - { - outputStream<FunctionValues()(i,j)<<" "; - } - outputStream< 0) - { - crossEntropy = epochCrossEntropy - prevEpochCrossEntropy; - evalError = epochEvalError - prevEpochEvalError; - fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", - prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples); - } - - //final statistics - epochEvalError /= (ElemType)totalEpochSamples; - epochCrossEntropy /= (ElemType)totalEpochSamples; - fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy); - if (outputStream.is_open()) - { - outputStream.close(); - } - evalSetCrossEntropy = epochCrossEntropy; - return epochEvalError; - } - - protected: - void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector& evalNodes, - const vector & evalResults, const vector & evalResultsLastMBs) - { - fprintf(stderr,"Minibatch[%lu-%lu]: Samples Seen = %lu ", startMBNum, endMBNum, numSamplesLastMBs); - - for (size_t i=0; iNodeName().c_str(), (evalResults[i]-evalResultsLastMBs[i])/numSamplesLastMBs); - } - - fprintf(stderr, "\n"); - } - - protected: - ComputationNetwork& m_net; - size_t m_numMBsToShowResult; - int m_traceLevel; - void operator=(const SimpleEvaluator&); // (not assignable) - }; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// +#pragma once + +#include "ComputationNetwork.h" +#include "ComputationNetworkHelper.h" +#include "DataReader.h" +#include +#include +#include +#include "basetypes.h" +#include "fileutil.h" +#include "commandArgUtil.h" +#include + +using namespace std; + +namespace Microsoft { namespace MSR { namespace CNTK { + + template + class SimpleEvaluator : ComputationNetworkHelper + { + typedef ComputationNetworkHelper B; + using B::UpdateEvalTimeStamps; + protected: + typedef ComputationNode* ComputationNodePtr; + typedef ClassBasedCrossEntropyWithSoftmaxNode* ClassBasedCrossEntropyWithSoftmaxNodePtr; + + public: + + SimpleEvaluator(ComputationNetwork& net, const size_t numMBsToShowResult=100, const int traceLevel=0) + : m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel) + { + } + + //returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes) + vector Evaluate(IDataReader& dataReader, const vector& evalNodeNames, const size_t mbSize, const size_t testSize=requestDataSize) + { + //specify evaluation nodes + std::vector evalNodes; + + if (evalNodeNames.size() == 0) + { + fprintf (stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); + if (m_net.EvaluationNodes().size() == 0 && m_net.FinalCriterionNodes().size() == 0) + throw std::logic_error("There is no default evalnodes or training criterion node specified in the network."); + + for (int i=0; i< m_net.EvaluationNodes().size(); i++) + evalNodes.push_back(m_net.EvaluationNodes()[i]); + + for (int i=0; i< m_net.FinalCriterionNodes().size(); i++) + evalNodes.push_back(m_net.FinalCriterionNodes()[i]); + } + else + { + for (int i=0; iFunctionValues().GetNumElements() == 1) + { + throw std::logic_error("The nodes passed to SimpleEvaluator::Evaluate function must be either eval or training criterion nodes (which evalues to 1x1 value)."); + } + evalNodes.push_back(node); + } + } + + //initialize eval results + std::vector evalResults; + for (int i=0; i< evalNodes.size(); i++) + { + evalResults.push_back((ElemType)0); + evalNodes[i]->Reset(); + } + + //prepare features and labels + std::vector & FeatureNodes = m_net.FeatureNodes(); + std::vector & labelNodes = m_net.LabelNodes(); + + std::map*> inputMatrices; + for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); + } + for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); + } + + //evaluate through minibatches + size_t totalEpochSamples = 0; + size_t numMBsRun = 0; + size_t actualMBSize = 0; + size_t numSamplesLastMBs = 0; + size_t lastMBsRun = 0; //MBs run before this display + + std::vector evalResultsLastMBs; + for (int i=0; i< evalResults.size(); i++) + evalResultsLastMBs.push_back((ElemType)0); + + dataReader.StartMinibatchLoop(mbSize, 0, testSize); + + for (int i=0; iOperationName() == L"ClassBasedCrossEntropyWithSoftmax") + { + size_t vSz = FeatureNodes[0]->FunctionValues().GetNumRows(); + if(inputMatrices.find(L"classinfo") == inputMatrices.end()) + { + inputMatrices[L"idx2cls"] = new Matrix(vSz, 1, m_net.GetDeviceID()); + inputMatrices[L"classinfo"] = new Matrix(vSz, 1, m_net.GetDeviceID()); + } + ClassBasedCrossEntropyWithSoftmaxNodePtr crtNode = (ClassBasedCrossEntropyWithSoftmaxNodePtr) evalNodes[i]; + crtNode->AddClassInfo(inputMatrices[L"classinfo"], inputMatrices[L"idx2cls"]); + } + } + + while (dataReader.GetMinibatch(inputMatrices)) + { + UpdateEvalTimeStamps(FeatureNodes); + 
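+            // (Evaluation here is cached by timestamp: a node is recomputed only when an
+            // input carries a newer timestamp, so the feature and label nodes are bumped
+            // to mark the freshly loaded minibatch as new before Evaluate() runs.)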
UpdateEvalTimeStamps(labelNodes); + + actualMBSize = m_net.GetActualMBSize(); + m_net.SetActualMiniBatchSize(actualMBSize); + m_net.SetActualNbrSlicesInEachRecIter(dataReader.NumberSlicesInEachRecurrentIter()); + dataReader.SetSentenceEndInBatch(m_net.m_sentenceEnd); + + for (int i=0; iFunctionValues().Get00Element(); //criterionNode should be a scalar + } + + totalEpochSamples += actualMBSize; + numMBsRun++; + + if (m_traceLevel > 0) + { + numSamplesLastMBs += actualMBSize; + + if (numMBsRun % m_numMBsToShowResult == 0) + { + DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); + + for (int i=0; i 0 && numSamplesLastMBs > 0) + { + DisplayEvalStatistics(lastMBsRun+1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs); + } + + //final statistics + for (int i=0; i& dataReader, const size_t mbSize, ElemType &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize) + { + + std::vector FeatureNodes = m_net.FeatureNodes(); + std::vector labelNodes = m_net.LabelNodes(); + std::vector criterionNodes = m_net.FinalCriterionNodes(); + std::vector evaluationNodes = m_net.EvaluationNodes(); + + if (criterionNodes.size()==0) + { + throw std::runtime_error("No CrossEntropyWithSoftmax node found\n"); + } + if (evaluationNodes.size()==0) + { + throw std::runtime_error("No Evaluation node found\n"); + } + + std::map*> inputMatrices; + for (size_t i=0; iNodeName()] = &FeatureNodes[i]->FunctionValues(); + } + for (size_t i=0; iNodeName()] = &labelNodes[i]->FunctionValues(); + } + inputMatrices[L"numberobs"] = new Matrix(1,1, m_net.GetDeviceID()); + + dataReader.StartMinibatchLoop(mbSize, 0, testSize); + + ElemType epochEvalError = 0; + ElemType epochCrossEntropy = 0; + size_t totalEpochSamples = 0; + ElemType prevEpochEvalError = 0; + ElemType prevEpochCrossEntropy = 0; + size_t prevTotalEpochSamples = 0; + size_t prevStart = 1; + size_t numSamples = 0; + ElemType crossEntropy = 0; + ElemType evalError = 0; + + ofstream outputStream; + if (output) + { +#ifdef _MSC_VER + outputStream.open(output); +#else + outputStream.open(charpath(output)); // GCC does not implement wide-char pathnames here +#endif + } + + size_t numMBsRun = 0; + size_t actualMBSize = 0; + while (dataReader.GetMinibatch(inputMatrices)) + { + size_t nbrSamples = (size_t)(*inputMatrices[L"numberobs"])(0, 0); + actualMBSize = nbrSamples; + + for (int npos = 0; npos < nbrSamples ; npos++) + { + FeatureNodes[npos]->UpdateEvalTimeStamp(); + labelNodes[npos]->UpdateEvalTimeStamp(); + + m_net.Evaluate(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more? 
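+
+                    // In this unrolled path the reader reports the observation count through
+                    // the special "numberobs" input (read above into nbrSamples), and the
+                    // node vectors are indexed per observation, so position npos of the
+                    // criterion/evaluation nodes scores observation npos of the minibatch.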
+ + m_net.Evaluate(evaluationNodes[npos]); + + ElemType mbCrossEntropy = criterionNodes[npos]->FunctionValues().Get00Element(); // criterionNode should be a scalar + epochCrossEntropy += mbCrossEntropy; + + ElemType mbEvalError = evaluationNodes[npos]->FunctionValues().Get00Element(); //criterionNode should be a scalar + + epochEvalError += mbEvalError; + } + + totalEpochSamples += actualMBSize; + + if (outputStream.is_open()) + { + //TODO: add support to dump multiple outputs + ComputationNodePtr outputNode = m_net.OutputNodes()[0]; + foreach_column(j, outputNode->FunctionValues()) + { + foreach_row(i,outputNode->FunctionValues()) + { + outputStream<FunctionValues()(i,j)<<" "; + } + outputStream< 0) + { + crossEntropy = epochCrossEntropy - prevEpochCrossEntropy; + evalError = epochEvalError - prevEpochEvalError; + fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", + prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples); + } + + //final statistics + epochEvalError /= (ElemType)totalEpochSamples; + epochCrossEntropy /= (ElemType)totalEpochSamples; + fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy); + if (outputStream.is_open()) + { + outputStream.close(); + } + evalSetCrossEntropy = epochCrossEntropy; + return epochEvalError; + } + + protected: + void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector& evalNodes, + const vector & evalResults, const vector & evalResultsLastMBs) + { + fprintf(stderr,"Minibatch[%lu-%lu]: Samples Seen = %lu ", startMBNum, endMBNum, numSamplesLastMBs); + + for (size_t i=0; iNodeName().c_str(), (evalResults[i]-evalResultsLastMBs[i])/numSamplesLastMBs); + } + + fprintf(stderr, "\n"); + } + + protected: + ComputationNetwork& m_net; + size_t m_numMBsToShowResult; + int m_traceLevel; + void operator=(const SimpleEvaluator&); // (not assignable) + }; + +}}} diff --git a/MachineLearning/cn/SynchronousExecutionEngine.h b/MachineLearning/cn/SynchronousExecutionEngine.h index cebec5069..94ffea8bf 100644 --- a/MachineLearning/cn/SynchronousExecutionEngine.h +++ b/MachineLearning/cn/SynchronousExecutionEngine.h @@ -1,780 +1,780 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - -#pragma once - -#include "IExecutionEngine.h" -#include "ComputationNetwork.h" -#include "fileutil.h" // for fexists() - -namespace Microsoft { namespace MSR { namespace CNTK { - -// SynchronousNodeEvaluator -// Process the Network Description Language into a Computation Network useable -// by SynchronousExecutionEngine. -template -class SynchronousNodeEvaluator : public NDLNodeEvaluator -{ -public: - // Constructor - create evaluator - SynchronousNodeEvaluator(ComputationNetwork& cn) : m_net(cn) - { } - - // Evaluate - evaluate a node and translate into underlying - // node - node we are evaluating - // baseName - base name for all symbols at this level - // pass - NDLPass through the evaluation (0-initial, 1-resolve variables, 2-final) - virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) - { - // constants don't need to be evaluated, they just translate into numbers... 
- if (node->GetType() == ndlTypeConstant - || node->GetType() == ndlTypeArray) - return; - - // setup the node parameters, where they start in the parameter list, and how many there are - // this is needed for the ndlPassResolve step to hookup all the inputs - int nodeParamStart = 0; - int nodeParamCount = 0; - - // get the parameters - std::vector*> parameter = node->GetParameters(); - - // get the name for the symbol to be used by CN nodes - std::wstring name = msra::strfun::utf16(node->GetName()); - if (!baseName.empty()) - { - name = baseName + L"." + name; - } - - std::wstring cnNodeType = msra::strfun::utf16(node->GetValue()); - - ComputationNodePtr nodePtr = nullptr; - - // get the node pointer for the node, should be stored in the EvalValue; - if (pass > ndlPassInitial) - { - nodePtr = (ComputationNodePtr)node->GetEvalValue(); - if (nodePtr == nullptr) - { - nodePtr = (ComputationNodePtr)m_net.GetNodeFromName(name); - node->SetEvalValue(nodePtr); - } - } - - if (InputValue::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; - - // first look for this node already existing in the network - if (m_net.NodeNameExist(name)) - nodePtr = m_net.GetNodeFromName(name); - else - nodePtr = m_net.CreateInputNode(name, rows, cols); - } - } - else if (SparseInputValue::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; - - // first look for this node already existing in the network - if (m_net.NodeNameExist(name)) - nodePtr = m_net.GetNodeFromName(name); - else - nodePtr = m_net.CreateSparseInputNode(name, rows, cols); - } - } - else if (cnNodeType == L"ImageInput") - { - if (parameter.size() < 3 || parameter.size() > 4) - RuntimeError("%ws should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t imageWidth = ((NDLNode*)params[0])->GetScalar(); - size_t imageHeight = ((NDLNode*)params[1])->GetScalar(); - size_t imageChannels = ((NDLNode*)params[2])->GetScalar(); - size_t numImages = parameter.size() > 3 ? 
((NDLNode*)params[3])->GetScalar() : 1; - - nodePtr = m_net.CreateInputNode(name, imageWidth, imageHeight, imageChannels, numImages); - } - } - else if (LearnableParameter::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; - - bool needGradient = node->GetOptionalParameter("needGradient", "true"); - - nodePtr = m_net.CreateLearnableParameter(name, rows, cols); - - nodePtr->NeedGradient() = needGradient; - } - else if (pass == ndlPassFinal) - { - static int randomSeed = 1; - std::string initString = node->GetOptionalParameter("init", "uniform"); - ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); - ElemType value = node->GetOptionalParameter("value", "0"); - - msra::strfun::tolower_ascii (initString); - if (initString == "fixedvalue") - nodePtr->FunctionValues().SetValue(value); - else if (initString == "uniform") - m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); - else if (initString == "gaussian") - m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); - else if (initString == "fromfile") - { - std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); - if (initFromFilePath == "") - RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); - if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') - // remove the opening and closing double quotes - initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); - if(!fexists(initFromFilePath)) - RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); - m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); - } - else - RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); - } - } - else if (SparseLearnableParameter::TypeName() == cnNodeType) - { - if (parameter.size() < 1 || parameter.size() > 2) - RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - size_t cols = params.size() > 1 ? 
((NDLNode*)params[1])->GetScalar() : 1; - - bool needGradient = node->GetOptionalParameter("needGradient", "true"); - - nodePtr = m_net.CreateSparseLearnableParameter(name, rows, cols); - - nodePtr->NeedGradient() = needGradient; - } - else if (pass == ndlPassFinal) - { - static int randomSeed = 1; - std::string initString = node->GetOptionalParameter("init", "uniform"); - ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); - ElemType value = node->GetOptionalParameter("value", "0"); - - msra::strfun::tolower_ascii(initString); - if (initString == "fixedvalue") - nodePtr->FunctionValues().SetValue(value); - else if (initString == "uniform") - m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); - else if (initString == "gaussian") - m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); - else if (initString == "fromfile") - { - std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); - if (initFromFilePath == "") - RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); - if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') - // remove the opening and closing double quotes - initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); - if(!fexists(initFromFilePath)) - RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); - m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); - } - else - RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); - } - } - else if (cnNodeType == L"Constant") - { - if (parameter.size() != 1) - RuntimeError("Constant should have 1 fixed parameter [val] and two optional parameters [rows=[1|yourvalue], cols=[1|yourvalue]]."); - - if (pass == ndlPassInitial) - { - size_t rows = node->GetOptionalParameter("rows", "1"); - size_t cols = node->GetOptionalParameter("cols", "1"); - - nodePtr = m_net.CreateLearnableParameter(name, rows, cols); - nodePtr->NeedGradient() = false; - } - else if (pass == ndlPassFinal) - { - ElemType val = parameter[0]->GetScalar(); - nodePtr->FunctionValues().SetValue(val); - } - } - else if (cnNodeType == RowSliceNode::TypeName()) - { - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - // parameters are (rows, [cols], inputNode) - nodeParamStart = parameter.size() > 2?2:1; - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t start_index = ((NDLNode*)params[0])->GetScalar(); - size_t num_rows = ((NDLNode*)params[1])->GetScalar(); - - bool needGradient = node->GetOptionalParameter("needGradient", "false"); - nodePtr = m_net.RowSlice(NULL, start_index, num_rows, name); - nodePtr->NeedGradient() = needGradient; - - } - } - else if (cnNodeType == DelayNode::TypeName()) - { - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - // parameters are (rows, [cols], delayNode) - nodeParamStart = parameter.size() > 2?2:1; - - if (pass == ndlPassInitial) - { - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); - size_t rows = ((NDLNode*)params[0])->GetScalar(); - // if we have three parameters the second is columns - size_t cols = parameter.size() > 2 ? 
((NDLNode*)params[1])->GetScalar() : 1; - - bool needGradient = node->GetOptionalParameter("needGradient", "false"); - float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); - nodePtr = m_net.Delay(NULL, defaultHiddenActivity, rows, cols, name); - size_t delayTime = node->GetOptionalParameter("delayTime","1"); - ((DelayNode*)nodePtr)->SetDelay(delayTime); - - nodePtr->NeedGradient() = needGradient; - } - } - else if (cnNodeType == ConvolutionNode::TypeName()) - { - if (parameter.size() != 7) - RuntimeError("%ws should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str()); - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 2; - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - int id = 2; // skip weightNode and inputValueNode - - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, id, parameter.size()-id, pass); - id = 0; // reset counter because the params array starts at zero - size_t kernelWidth = ((NDLNode*)params[id++])->GetScalar(); - size_t kernelHeight = ((NDLNode*)params[id++])->GetScalar(); - size_t outputChannels = ((NDLNode*)params[id++])->GetScalar(); - size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); - size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); - - assert (id == 5); - - //optional - bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false"); - size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0"); - - - nodePtr = m_net.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels, - horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples); - } - } - else if (cnNodeType == MaxPoolingNode::TypeName()) - { - if (parameter.size() != 5) - RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - int id = 1; // skip inputValueNode - - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass); - id = 0; // reset counter because the params array starts at zero - size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); - size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); - size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); - size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); - - assert (id == 4); - - nodePtr = m_net.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, - horizontalSubsample, verticalSubsample, name); - } - } - else if (cnNodeType == AveragePoolingNode::TypeName()) - { - if (parameter.size() != 5) - RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); - - // setup the parameter position of children so we can hook them up later - nodeParamCount = 1; - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - int id = 1; // skip inputValueNode - - // evaluate only scalar parameters - vector params = EvaluateParameters(node, baseName, id, 
parameter.size() - id, pass); - id = 0; // reset counter because the params array starts at zero - size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); - size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); - size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); - size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); - - assert (id == 4); - - nodePtr = m_net.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, - horizontalSubsample, verticalSubsample, name); - } - } - else - { - - // setup the variables for node parameter processing - nodeParamCount = parameter.size(); // all parameters are nodes in standard nodes - nodeParamStart = 0; - - if (pass == ndlPassInitial) - { - nodePtr = m_net.CreateComputationNode(node->GetValue(), name); - } - } - - switch (pass) - { - case ndlPassInitial: - node->SetEvalValue(nodePtr); - // evaluate parameters - EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); - break; - case ndlPassResolve: - { - std::vector inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); - - switch (inputs.size()) - { - case 1: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); - break; - case 2: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1])); - break; - case 3: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2])); - break; - case 4: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3])); - break; - default: - if (nodeParamCount > 0) - RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str()); - break; - } - - // process common optional parameters (like "tag"); - ProcessOptionalParameters(node); - break; - } - case ndlPassFinal: - break; - } - } - -#ifdef LATER - // EvaluateDotName - Evaluate a dot name and resolve to target node - // node - NDLNode of the script - // nodeParam - NDLNode parameter we are evaluating - // baseName - name of the base node - // pass - which pass through the NDL nodes - // returns: the node that is the evaluated parameter - virtual NDLNode* EvaluateDotName(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseNameP, const NDLPass pass) - - { - if (pass > ndlPassInitial && evaluateNode) - { - std::string name = nodeParam->GetName(); - std::wstring wname = msra::strfun::utf16(name); - if (nodeParam->GetType() == ndlTypeDotParameter) - { - // When we see a variable of the form "A.B" in a macro, we need to resolve it to an actual node, by first constructing it's - // fully-qualified name. There are 2 possibilities: - // 1) "A" was defined locally within the macro. In this case, we must find the fully-qualified name of the node that this macro - // call is being assigned to (eg, "C" in the example "C=Macro(X)"), and concatenate it's name with "A.B" (eg, "C.A.B"). - // 2) "A" was passed in as a parameter to a macro. In this case, we must find the fully-qualified name of the node that - // was passed in as "A", and replace the "A" and "A.B" with this name. - - // Consider the following example: - // NdlBLob=[ - // P=MacroCall1(...) - // C=MacroCall2(P) - // ] - // # MacroDefinition - // MacroCall2(X) - // { - // A=MacroCall3(...) - // D=Times(A.B,X.B)} - // } - // - - // In this example, in the call D=Times(A.B,X.B), we need to resolve A.B and X.B appropriately. 
- // Specifically, "A.B" must be resolved to the fully qualified name "C.A.B", whereas "X.B" must be resolved to the fully qualified name "P.B". - // We then use this fully-qualified name to look up this node in the model (using "m_net.GetNodeFromName"). - - std::size_t firstDotPos = name.find_first_of("."); - if (firstDotPos == std::string::npos) - { - LogicError("nodeParam of type \"ndlTypeDotParameter\" doesn't have a dot in its name: %s", name.c_str()); - } - - std::string nameBeforeDot = name.substr(0, firstDotPos); - std::string nameAfterDot = name.substr(firstDotPos + 1, name.size() - (firstDotPos + 1)); - - // look up if "nameBeforeDot" was a parameter to the macro. - NDLNode* resolvedParam = nodeParam->GetParentScript()->FindSymbol(nameBeforeDot); - if (resolvedParam != nullptr && resolvedParam->GetType() == ndlTypeMacroCall) - { - // if "nameBeforeDot" was a parameter to the macro, builds it's fully qualified name by - // replacing "nameBeforeDot" with the fully qualified name of the node passed in as the parameter. - NDLScript* parentScript = resolvedParam->GetParentScript(); - baseName = parentScript->GetBaseName(); - std::wstring resolvedParamName = msra::strfun::utf16(resolvedParam->GetName()); - wname = baseName.empty() ? - resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot) : - baseName + L"." + resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot); - } - else if (!baseName.empty()) - { - // else, "nameBeforeDot" wasn't a parameter to the macro, so treat it as a local variable. - wname = baseName + L"." + wname; - } - } - else if (!baseName.empty()) - { - wname = baseName + L"." + wname; - } - - // fully qualified names can be looked up in the model - if (m_net.NodeNameExist(wname)) - { - void* np = (void*)m_net.GetNodeFromName(wname); - nodeParam->SetEvalValue(np); - } - // NOTE: there is a bug here, we allow an abbreviated node reference (i.e. L1.BFF) based on return values in NDL - // when the actual full node reference that the computational network uses would be L1.BFF.FF.P, so that is what CN sees - // can we do the normal find symbol here to allow abbreviated node references? - - // if we still didn't get a value, throw an error - if (nodeParam->GetEvalValue() == nullptr) - { - LogicError("Dot name could not be resolved '%s': should have a node named '%ls' in computational network\n", nodeParam->GetName().c_str(), name.c_str()); - } - } - return nodeParam; - } -#endif - - // EvaluateParameter - Evaluate a parameter of a call - // node - NDLNode of the script - // nodeParam - NDLNode parameter we are evaluating - // baseName - name of the base node - // pass - which pass through the NDL nodes - // returns: the node that is the evaluated parameter - virtual NDLNode* EvaluateParameter(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseNameP, const NDLPass pass ) - { - // get the parent script that includes the symbol table we are interested in - NDLScript* script = node->GetParentScript(); - wstring baseName = baseNameP; - if (script == NULL) - { - std::wstring name = baseName + L"." 
+ msra::strfun::utf16(node->GetName()); - LogicError("no script for a parameter node in call to %ls\n", name.c_str()); - } - - // evaluate the parameter if we haven't yet, or if we are in the resolve pass (need to set the inputs) - bool evaluateNode = nodeParam->GetEvalValue() == NULL || pass == ndlPassResolve; - switch (nodeParam->GetType()) - { - // if the node is a parameter then look it up in the symbol table - case ndlTypeUndetermined: // an undetermined parameter needs to be looked up again in the symbol table - case ndlTypeParameter: - { - // lookup the parameter - NDLNode* nodeResolve = script->FindSymbol(nodeParam->GetName()); - - // if we have resolved the name, no need to continue evaluation - if (!(pass == ndlPassResolve && nodeResolve && nodeParam->GetEvalValue() == nullptr)) - { - break; - } - if (pass > ndlPassInitial && evaluateNode && nodeResolve) - { - std::string name = nodeResolve->GetName(); - // we need to start from the parent script, because that is the namespace of the parameter being passed in - NDLScript* parentScript = nodeResolve->GetParentScript(); - nodeResolve = parentScript->FindSymbol(name); - - // if we still didn't get a value - if (nodeResolve == nullptr || nodeResolve->GetEvalValue() == nullptr) - { - // check for the fully quantified name in the computation network - // this is needed for MEL processing, since CN nodes names can be used as parameters in MEL - std::wstring wname = msra::strfun::utf16(name); - if (m_net.NodeNameExist(wname)) - { - void* np = (void*)m_net.GetNodeFromName(wname); - // if we don't have a resolve node, it's because the name didn't exist in NDL - if (!nodeResolve) - nodeResolve = nodeParam; - nodeResolve->SetEvalValue(np); - } - else - { - RuntimeError("Parameter name could not be resolved '%s'\n", name.c_str()); - } - } - } - nodeParam = nodeResolve; - break; - } - case ndlTypeFunction: - if (evaluateNode) - Evaluate(nodeParam, baseName, pass); - break; - case ndlTypeMacroCall: - if (evaluateNode) - nodeParam->EvaluateMacro(*this, baseName, pass); - break; - // constants and variables are good as is - case ndlTypeConstant: - case ndlTypeVariable: - break; - // everything else is illegal as a parameter - default: - { - std::wstring name = baseName + L"." 
+ msra::strfun::utf16(node->GetName()); - RuntimeError("Invalid parameter (macro definitions and arrays not allowed), see call to %ls\n", name.c_str()); - } - break; - } - return nodeParam; - } - - // EvaluateParameters - Evaluate the parameters of a call - // node - NDLNode we are evaluating paramters for - // baseName - baseName for the current node - // nodeParamStart - starting parameter that contains a node - // nodeParamCount - ending parameter that contains a node - // pass - NDL pass we are evaluating - // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator - virtual std::vector EvaluateParameters(NDLNode* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass) - { - std::vector inputs; - std::vector*> parameter = node->GetParameters(); - ConfigArray paramString = node->GetParamString(); - - if (parameter.size() < 1) - { - return inputs; - } - if (nodeParamStart + nodeParamCount > parameter.size()) - throw logic_error("EvaluateParmeters: nodeParamters specified that do not exist"); - size_t numChildren = nodeParamCount; - for (size_t i=0; i < numChildren;++i) - { - int index = i+nodeParamStart; - NDLNode* nodeParam = parameter[index]; - std::wstring paramS = paramString[index]; - - // default base is same as current - std::wstring baseSymbol = baseName; - - NDLNode* nodeResult = EvaluateParameter(node, nodeParam, baseSymbol, pass); - // look for a prefix here and set baseName appropriately - - if (pass == ndlPassResolve) - { - void* np = nodeResult->GetEvalValue(); - assert(np != nullptr); - inputs.push_back((void*)np); - } - else if (pass == ndlPassInitial) // for initial pass we are only interested in resolved nodes (to get constant values) - { - inputs.push_back((void*)nodeResult); - } - // NOTE: in final pass inputs are always NULL - } - - // now return the vector - return inputs; - } - - // ProcessOptionalParameters - Process the optional parameters of a node - virtual void ProcessOptionalParameters(NDLNode* node) - { - vector*> params = node->GetParameters(true); // get all the optional parameters only - ComputationNode* compNode = (ComputationNode*)node->GetEvalValue(); - std::string empty; - - // loop through all the optional parameters processing them as necessary - for (NDLNode* param : params) - { - // make sure it's a "tag" optional parameter, that's all we process currently - if (_stricmp(param->GetName().c_str(), "tag")) - continue; - - std::string value = param->GetValue(); - if (!_stricmp(value.c_str(), "feature")) - { - SetOutputNode(m_net.FeatureNodes(), compNode); - } - else if (!_stricmp(value.c_str(), "label")) - { - SetOutputNode(m_net.LabelNodes(), compNode); - } - else if (!_stricmp(value.c_str(), "criteria")) - { - SetOutputNode(m_net.FinalCriterionNodes(), compNode); - } - else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters - { - SetOutputNode(m_net.EvaluationNodes(), compNode); - } - else if (!_stricmp(value.c_str(), "output")) - { - SetOutputNode(m_net.OutputNodes(), compNode); - } - } - - } - - // SetOutputNode - Set the output node, checks to see if it already exists first - // nodeGroup - group vector to add to - // compNode - computation node to add - void SetOutputNode(std::vector*>& nodeGroup, ComputationNode* compNode) - { - for (ComputationNodePtr node : nodeGroup) - { - if (node == compNode) - return; - } - nodeGroup.push_back(compNode); - } - - // FindSymbol - Search the nodes for a fully quantified symbol - // symbol - name of the symbol fully 
quantified name with "dots" - // returns - pointer to the matching EvalValue for that node, of NULL if not found - virtual void* FindSymbol(const wstring& symbol) - { - if (m_net.NodeNameExist(symbol)) - return m_net.GetNodeFromName(symbol); - return NULL; - } - - virtual ~SynchronousNodeEvaluator() - { - } - -private: - ComputationNetwork& m_net; - typedef ComputationNode* ComputationNodePtr; - void operator=(const SynchronousNodeEvaluator&); -}; - -// SynchronousExecutionEngine -// TODO JC Refactor eligible methods and members into abstract base class. -template -class SynchronousExecutionEngine : public IExecutionEngine -{ -public: - SynchronousExecutionEngine(DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, unsigned long randomSeedOffset=0) - { - m_computationNetwork = new ComputationNetwork(deviceId); - m_computationNetwork->SetRandomSeedOffset(randomSeedOffset); - m_ownNetwork = true; - m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); - } - - SynchronousExecutionEngine(ComputationNetwork* computationNetwork) - { - m_computationNetwork = computationNetwork; - m_ownNetwork = false; - m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); - } - - virtual ~SynchronousExecutionEngine() - { - if (m_ownNetwork) - delete m_computationNetwork; - delete m_nodeEvaluator; - } - - ComputationNetwork& GetComputationNetwork() - { - return *m_computationNetwork; - } - - NDLNodeEvaluator& GetNodeEvaluator() - { - return *m_nodeEvaluator; - } - -private: - bool m_ownNetwork; - ComputationNetwork* m_computationNetwork; - SynchronousNodeEvaluator* m_nodeEvaluator; -protected: - // Copy constructor, should never be called. - SynchronousExecutionEngine(const SynchronousExecutionEngine& /*deepCopyFrom*/) - { - throw std::logic_error("'SynchronousExecutionEngine(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); - } - - // Assignment operator, should never be called. - SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& /*deepCopyFrom*/) - { - throw std::logic_error("'SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); - } -}; - -template class SynchronousExecutionEngine; -template class SynchronousExecutionEngine; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// + +#pragma once + +#include "IExecutionEngine.h" +#include "ComputationNetwork.h" +#include "fileutil.h" // for fexists() + +namespace Microsoft { namespace MSR { namespace CNTK { + +// SynchronousNodeEvaluator +// Process the Network Description Language into a Computation Network useable +// by SynchronousExecutionEngine. +template +class SynchronousNodeEvaluator : public NDLNodeEvaluator +{ +public: + // Constructor - create evaluator + SynchronousNodeEvaluator(ComputationNetwork& cn) : m_net(cn) + { } + + // Evaluate - evaluate a node and translate into underlying + // node - node we are evaluating + // baseName - base name for all symbols at this level + // pass - NDLPass through the evaluation (0-initial, 1-resolve variables, 2-final) + virtual void Evaluate(NDLNode* node, const wstring& baseName, const NDLPass pass) + { + // constants don't need to be evaluated, they just translate into numbers... 
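+        // The evaluator runs over the NDL script three times (see the pass parameter
+        // above): ndlPassInitial creates ComputationNetwork nodes from scalar parameters,
+        // ndlPassResolve looks the input nodes up and attaches them via AttachInputs, and
+        // ndlPassFinal performs deferred work such as LearnableParameter initialization;
+        // the switch at the end of this method dispatches on the same three passes.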
+ if (node->GetType() == ndlTypeConstant + || node->GetType() == ndlTypeArray) + return; + + // setup the node parameters, where they start in the parameter list, and how many there are + // this is needed for the ndlPassResolve step to hookup all the inputs + int nodeParamStart = 0; + int nodeParamCount = 0; + + // get the parameters + std::vector*> parameter = node->GetParameters(); + + // get the name for the symbol to be used by CN nodes + std::wstring name = msra::strfun::utf16(node->GetName()); + if (!baseName.empty()) + { + name = baseName + L"." + name; + } + + std::wstring cnNodeType = msra::strfun::utf16(node->GetValue()); + + ComputationNodePtr nodePtr = nullptr; + + // get the node pointer for the node, should be stored in the EvalValue; + if (pass > ndlPassInitial) + { + nodePtr = (ComputationNodePtr)node->GetEvalValue(); + if (nodePtr == nullptr) + { + nodePtr = (ComputationNodePtr)m_net.GetNodeFromName(name); + node->SetEvalValue(nodePtr); + } + } + + if (InputValue::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; + + // first look for this node already existing in the network + if (m_net.NodeNameExist(name)) + nodePtr = m_net.GetNodeFromName(name); + else + nodePtr = m_net.CreateInputNode(name, rows, cols); + } + } + else if (SparseInputValue::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; + + // first look for this node already existing in the network + if (m_net.NodeNameExist(name)) + nodePtr = m_net.GetNodeFromName(name); + else + nodePtr = m_net.CreateSparseInputNode(name, rows, cols); + } + } + else if (cnNodeType == L"ImageInput") + { + if (parameter.size() < 3 || parameter.size() > 4) + RuntimeError("%ws should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t imageWidth = ((NDLNode*)params[0])->GetScalar(); + size_t imageHeight = ((NDLNode*)params[1])->GetScalar(); + size_t imageChannels = ((NDLNode*)params[2])->GetScalar(); + size_t numImages = parameter.size() > 3 ? 
((NDLNode*)params[3])->GetScalar() : 1; + + nodePtr = m_net.CreateInputNode(name, imageWidth, imageHeight, imageChannels, numImages); + } + } + else if (LearnableParameter::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? ((NDLNode*)params[1])->GetScalar() : 1; + + bool needGradient = node->GetOptionalParameter("needGradient", "true"); + + nodePtr = m_net.CreateLearnableParameter(name, rows, cols); + + nodePtr->NeedGradient() = needGradient; + } + else if (pass == ndlPassFinal) + { + static int randomSeed = 1; + std::string initString = node->GetOptionalParameter("init", "uniform"); + ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); + ElemType value = node->GetOptionalParameter("value", "0"); + + msra::strfun::tolower_ascii (initString); + if (initString == "fixedvalue") + nodePtr->FunctionValues().SetValue(value); + else if (initString == "uniform") + m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); + else if (initString == "gaussian") + m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); + else if (initString == "fromfile") + { + std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); + if (initFromFilePath == "") + RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); + if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') + // remove the opening and closing double quotes + initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); + if(!fexists(initFromFilePath)) + RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); + m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); + } + else + RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); + } + } + else if (SparseLearnableParameter::TypeName() == cnNodeType) + { + if (parameter.size() < 1 || parameter.size() > 2) + RuntimeError("%ws should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str()); + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + size_t cols = params.size() > 1 ? 
((NDLNode*)params[1])->GetScalar() : 1; + + bool needGradient = node->GetOptionalParameter("needGradient", "true"); + + nodePtr = m_net.CreateSparseLearnableParameter(name, rows, cols); + + nodePtr->NeedGradient() = needGradient; + } + else if (pass == ndlPassFinal) + { + static int randomSeed = 1; + std::string initString = node->GetOptionalParameter("init", "uniform"); + ElemType initValueScale = node->GetOptionalParameter("initValueScale", "1"); + ElemType value = node->GetOptionalParameter("value", "0"); + + msra::strfun::tolower_ascii(initString); + if (initString == "fixedvalue") + nodePtr->FunctionValues().SetValue(value); + else if (initString == "uniform") + m_net.InitLearnableParameters(nodePtr, true, randomSeed++, initValueScale); + else if (initString == "gaussian") + m_net.InitLearnableParameters(nodePtr, false, randomSeed++, initValueScale); + else if (initString == "fromfile") + { + std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", ""); + if (initFromFilePath == "") + RuntimeError("initFromFilePath must be set when using \"fromFile\" initialization method"); + if(initFromFilePath[0] == '\"' && initFromFilePath[initFromFilePath.size()-1] == '\"') + // remove the opening and closing double quotes + initFromFilePath = initFromFilePath.substr(1, initFromFilePath.size()-2); + if(!fexists(initFromFilePath)) + RuntimeError("File pointed to by initFromFilePath does not exist: %s", initFromFilePath.c_str()); + m_net.InitLearnableParametersFromFile(nodePtr, initFromFilePath); + } + else + RuntimeError("init must be one of the values of [uniform|gaussian|fixedvalue]"); + } + } + else if (cnNodeType == L"Constant") + { + if (parameter.size() != 1) + RuntimeError("Constant should have 1 fixed parameter [val] and two optional parameters [rows=[1|yourvalue], cols=[1|yourvalue]]."); + + if (pass == ndlPassInitial) + { + size_t rows = node->GetOptionalParameter("rows", "1"); + size_t cols = node->GetOptionalParameter("cols", "1"); + + nodePtr = m_net.CreateLearnableParameter(name, rows, cols); + nodePtr->NeedGradient() = false; + } + else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0) + { + ElemType val = parameter[0]->GetScalar(); + nodePtr->FunctionValues().SetValue(val); + } + } + else if (cnNodeType == RowSliceNode::TypeName()) + { + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + // parameters are (rows, [cols], inputNode) + nodeParamStart = parameter.size() > 2?2:1; + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t start_index = ((NDLNode*)params[0])->GetScalar(); + size_t num_rows = ((NDLNode*)params[1])->GetScalar(); + + bool needGradient = node->GetOptionalParameter("needGradient", "false"); + nodePtr = m_net.RowSlice(NULL, start_index, num_rows, name); + nodePtr->NeedGradient() = needGradient; + + } + } + else if (cnNodeType == DelayNode::TypeName()) + { + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + // parameters are (rows, [cols], delayNode) + nodeParamStart = parameter.size() > 2?2:1; + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + size_t rows = ((NDLNode*)params[0])->GetScalar(); + // if we have three parameters the second is columns + size_t cols = parameter.size() > 2 ? 
((NDLNode*)params[1])->GetScalar() : 1; + + bool needGradient = node->GetOptionalParameter("needGradient", "false"); + float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); + nodePtr = m_net.Delay(NULL, defaultHiddenActivity, rows, cols, name); + size_t delayTime = node->GetOptionalParameter("delayTime","1"); + ((DelayNode*)nodePtr)->SetDelay(delayTime); + + nodePtr->NeedGradient() = needGradient; + } + } + else if (cnNodeType == ConvolutionNode::TypeName()) + { + if (parameter.size() != 7) + RuntimeError("%ws should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue]].", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 2; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + int id = 2; // skip weightNode and inputValueNode + + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, id, parameter.size()-id, pass); + id = 0; // reset counter because the params array starts at zero + size_t kernelWidth = ((NDLNode*)params[id++])->GetScalar(); + size_t kernelHeight = ((NDLNode*)params[id++])->GetScalar(); + size_t outputChannels = ((NDLNode*)params[id++])->GetScalar(); + size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); + size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); + + assert (id == 5); + + //optional + bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false"); + size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0"); + + + nodePtr = m_net.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels, + horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples); + } + } + else if (cnNodeType == MaxPoolingNode::TypeName()) + { + if (parameter.size() != 5) + RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + int id = 1; // skip inputValueNode + + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass); + id = 0; // reset counter because the params array starts at zero + size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); + size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); + size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); + size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); + + assert (id == 4); + + nodePtr = m_net.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, + horizontalSubsample, verticalSubsample, name); + } + } + else if (cnNodeType == AveragePoolingNode::TypeName()) + { + if (parameter.size() != 5) + RuntimeError("%ws should have 5 parameters[inputValueNodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample].", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + int id = 1; // skip inputValueNode + + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, id, 
parameter.size() - id, pass); + id = 0; // reset counter because the params array starts at zero + size_t windowWidth = ((NDLNode*)params[id++])->GetScalar(); + size_t windowHeight = ((NDLNode*)params[id++])->GetScalar(); + size_t horizontalSubsample = ((NDLNode*)params[id++])->GetScalar(); + size_t verticalSubsample = ((NDLNode*)params[id++])->GetScalar(); + + assert (id == 4); + + nodePtr = m_net.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight, + horizontalSubsample, verticalSubsample, name); + } + } + else + { + + // setup the variables for node parameter processing + nodeParamCount = parameter.size(); // all parameters are nodes in standard nodes + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + nodePtr = m_net.CreateComputationNode(node->GetValue(), name); + } + } + + switch (pass) + { + case ndlPassInitial: + node->SetEvalValue(nodePtr); + // evaluate parameters + EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); + break; + case ndlPassResolve: + { + std::vector inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); + + switch (inputs.size()) + { + case 1: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); + break; + case 2: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1])); + break; + case 3: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2])); + break; + case 4: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3])); + break; + default: + if (nodeParamCount > 0) + RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str()); + break; + } + + // process common optional parameters (like "tag"); + ProcessOptionalParameters(node); + break; + } + case ndlPassFinal: + break; + } + } + +#ifdef LATER + // EvaluateDotName - Evaluate a dot name and resolve to target node + // node - NDLNode of the script + // nodeParam - NDLNode parameter we are evaluating + // baseName - name of the base node + // pass - which pass through the NDL nodes + // returns: the node that is the evaluated parameter + virtual NDLNode* EvaluateDotName(NDLNode* node, NDLNode* nodeParam, const std::wstring& baseNameP, const NDLPass pass) + + { + if (pass > ndlPassInitial && evaluateNode) + { + std::string name = nodeParam->GetName(); + std::wstring wname = msra::strfun::utf16(name); + if (nodeParam->GetType() == ndlTypeDotParameter) + { + // When we see a variable of the form "A.B" in a macro, we need to resolve it to an actual node, by first constructing it's + // fully-qualified name. There are 2 possibilities: + // 1) "A" was defined locally within the macro. In this case, we must find the fully-qualified name of the node that this macro + // call is being assigned to (eg, "C" in the example "C=Macro(X)"), and concatenate it's name with "A.B" (eg, "C.A.B"). + // 2) "A" was passed in as a parameter to a macro. In this case, we must find the fully-qualified name of the node that + // was passed in as "A", and replace the "A" and "A.B" with this name. + + // Consider the following example: + // NdlBLob=[ + // P=MacroCall1(...) + // C=MacroCall2(P) + // ] + // # MacroDefinition + // MacroCall2(X) + // { + // A=MacroCall3(...) + // D=Times(A.B,X.B)} + // } + // + + // In this example, in the call D=Times(A.B,X.B), we need to resolve A.B and X.B appropriately. 
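+                // (In that call, "A" is defined locally inside MacroCall2, while "X" is the
+                // macro parameter bound to the node "P" from the enclosing NdlBLob scope.)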
+                // Specifically, "A.B" must be resolved to the fully qualified name "C.A.B", whereas "X.B" must be resolved to the fully qualified name "P.B".
+                // We then use this fully-qualified name to look up this node in the model (using "m_net.GetNodeFromName").
+
+                std::size_t firstDotPos = name.find_first_of(".");
+                if (firstDotPos == std::string::npos)
+                {
+                    LogicError("nodeParam of type \"ndlTypeDotParameter\" doesn't have a dot in its name: %s", name.c_str());
+                }
+
+                std::string nameBeforeDot = name.substr(0, firstDotPos);
+                std::string nameAfterDot = name.substr(firstDotPos + 1, name.size() - (firstDotPos + 1));
+
+                // look up whether "nameBeforeDot" was a parameter to the macro.
+                NDLNode<ElemType>* resolvedParam = nodeParam->GetParentScript()->FindSymbol(nameBeforeDot);
+                if (resolvedParam != nullptr && resolvedParam->GetType() == ndlTypeMacroCall)
+                {
+                    // if "nameBeforeDot" was a parameter to the macro, build its fully qualified name by
+                    // replacing "nameBeforeDot" with the fully qualified name of the node passed in as the parameter.
+                    NDLScript<ElemType>* parentScript = resolvedParam->GetParentScript();
+                    baseName = parentScript->GetBaseName();
+                    std::wstring resolvedParamName = msra::strfun::utf16(resolvedParam->GetName());
+                    wname = baseName.empty() ?
+                        resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot) :
+                        baseName + L"." + resolvedParamName + L"." + msra::strfun::utf16(nameAfterDot);
+                }
+                else if (!baseName.empty())
+                {
+                    // else, "nameBeforeDot" wasn't a parameter to the macro, so treat it as a local variable.
+                    wname = baseName + L"." + wname;
+                }
+            }
+            else if (!baseName.empty())
+            {
+                wname = baseName + L"." + wname;
+            }
+
+            // fully qualified names can be looked up in the model
+            if (m_net.NodeNameExist(wname))
+            {
+                void* np = (void*)m_net.GetNodeFromName(wname);
+                nodeParam->SetEvalValue(np);
+            }
+            // NOTE: there is a bug here, we allow an abbreviated node reference (i.e. L1.BFF) based on return values in NDL
+            // when the actual full node reference that the computational network uses would be L1.BFF.FF.P, so that is what CN sees
+            // can we do the normal find symbol here to allow abbreviated node references?
+
+            // if we still didn't get a value, throw an error
+            if (nodeParam->GetEvalValue() == nullptr)
+            {
+                LogicError("Dot name could not be resolved '%s': should have a node named '%s' in computational network\n", nodeParam->GetName().c_str(), name.c_str());
+            }
+        }
+        return nodeParam;
+    }
+#endif
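+
+    // Editorial note: EvaluateDotName above is compiled out (the LATER symbol appears
+    // to be undefined), so dot-qualified parameter names currently flow through
+    // EvaluateParameter below instead.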
+
+    // EvaluateParameter - Evaluate a parameter of a call
+    // node - NDLNode of the script
+    // nodeParam - NDLNode parameter we are evaluating
+    // baseName - name of the base node
+    // pass - which pass through the NDL nodes
+    // returns: the node that is the evaluated parameter
+    virtual NDLNode<ElemType>* EvaluateParameter(NDLNode<ElemType>* node, NDLNode<ElemType>* nodeParam, const std::wstring& baseNameP, const NDLPass pass)
+    {
+        // get the parent script that includes the symbol table we are interested in
+        NDLScript<ElemType>* script = node->GetParentScript();
+        wstring baseName = baseNameP;
+        if (script == NULL)
+        {
+            std::wstring name = baseName + L"." + msra::strfun::utf16(node->GetName());
+            LogicError("no script for a parameter node in call to %ls\n", name.c_str());
+        }
+
+        // evaluate the parameter if we haven't yet, or if we are in the resolve pass (need to set the inputs)
+        bool evaluateNode = nodeParam->GetEvalValue() == NULL || pass == ndlPassResolve;
+        switch (nodeParam->GetType())
+        {
+        // if the node is a parameter then look it up in the symbol table
+        case ndlTypeUndetermined: // an undetermined parameter needs to be looked up again in the symbol table
+        case ndlTypeParameter:
+        {
+            // look up the parameter
+            NDLNode<ElemType>* nodeResolve = script->FindSymbol(nodeParam->GetName());
+
+            // if we have resolved the name, no need to continue evaluation
+            if (!(pass == ndlPassResolve && nodeResolve && nodeParam->GetEvalValue() == nullptr))
+            {
+                break;
+            }
+            if (pass > ndlPassInitial && evaluateNode && nodeResolve)
+            {
+                std::string name = nodeResolve->GetName();
+                // we need to start from the parent script, because that is the namespace of the parameter being passed in
+                NDLScript<ElemType>* parentScript = nodeResolve->GetParentScript();
+                nodeResolve = parentScript->FindSymbol(name);
+
+                // if we still didn't get a value
+                if (nodeResolve == nullptr || nodeResolve->GetEvalValue() == nullptr)
+                {
+                    // check for the fully qualified name in the computation network
+                    // this is needed for MEL processing, since CN node names can be used as parameters in MEL
+                    std::wstring wname = msra::strfun::utf16(name);
+                    if (m_net.NodeNameExist(wname))
+                    {
+                        void* np = (void*)m_net.GetNodeFromName(wname);
+                        // if we don't have a resolve node, it's because the name didn't exist in NDL
+                        if (!nodeResolve)
+                            nodeResolve = nodeParam;
+                        nodeResolve->SetEvalValue(np);
+                    }
+                    else
+                    {
+                        RuntimeError("Parameter name could not be resolved '%s'\n", name.c_str());
+                    }
+                }
+            }
+            nodeParam = nodeResolve;
+            break;
+        }
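+        // Editorial summary (not original commentary): the parameter case above tries,
+        // in order, (1) the enclosing script's symbol table, (2) the parent script of
+        // the resolved symbol, and (3) the computation network itself by node name
+        // (the MEL case); only when all three fail is a RuntimeError raised.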
+        case ndlTypeFunction:
+            if (evaluateNode)
+                Evaluate(nodeParam, baseName, pass);
+            break;
+        case ndlTypeMacroCall:
+            if (evaluateNode)
+                nodeParam->EvaluateMacro(*this, baseName, pass);
+            break;
+        // constants and variables are good as is
+        case ndlTypeConstant:
+        case ndlTypeVariable:
+            break;
+        // everything else is illegal as a parameter
+        default:
+        {
+            std::wstring name = baseName + L"." + msra::strfun::utf16(node->GetName());
+            RuntimeError("Invalid parameter (macro definitions and arrays not allowed), see call to %ls\n", name.c_str());
+        }
+            break;
+        }
+        return nodeParam;
+    }
+
+    // EvaluateParameters - Evaluate the parameters of a call
+    // node - NDLNode we are evaluating parameters for
+    // baseName - baseName for the current node
+    // nodeParamStart - index of the first parameter that contains a node
+    // nodeParamCount - number of parameters that contain nodes
+    // pass - NDL pass we are evaluating
+    // returns: vector of eval pointers, which are ComputationNodePtr for CNEvaluator
+    virtual std::vector<void*> EvaluateParameters(NDLNode<ElemType>* node, const wstring& baseName, int nodeParamStart, int nodeParamCount, const NDLPass pass)
+    {
+        std::vector<void*> inputs;
+        std::vector<NDLNode<ElemType>*> parameter = node->GetParameters();
+        ConfigArray paramString = node->GetParamString();
+
+        if (parameter.size() < 1)
+        {
+            return inputs;
+        }
+        if (nodeParamStart + nodeParamCount > parameter.size())
+            throw logic_error("EvaluateParameters: node parameters specified that do not exist");
+        size_t numChildren = nodeParamCount;
+        for (size_t i = 0; i < numChildren; ++i)
+        {
+            int index = i + nodeParamStart;
+            NDLNode<ElemType>* nodeParam = parameter[index];
+            std::wstring paramS = paramString[index];
+
+            // default base is same as current
+            std::wstring baseSymbol = baseName;
+
+            NDLNode<ElemType>* nodeResult = EvaluateParameter(node, nodeParam, baseSymbol, pass);
+            // look for a prefix here and set baseName appropriately
+
+            if (pass == ndlPassResolve)
+            {
+                void* np = nodeResult->GetEvalValue();
+                assert(np != nullptr);
+                inputs.push_back((void*)np);
+            }
+            else if (pass == ndlPassInitial) // for the initial pass we are only interested in resolved nodes (to get constant values)
+            {
+                inputs.push_back((void*)nodeResult);
+            }
+            // NOTE: in the final pass inputs are always NULL
+        }
+
+        // now return the vector
+        return inputs;
+    }
+
+    // ProcessOptionalParameters - Process the optional parameters of a node
+    virtual void ProcessOptionalParameters(NDLNode<ElemType>* node)
+    {
+        vector<NDLNode<ElemType>*> params = node->GetParameters(true); // get all the optional parameters only
+        ComputationNode<ElemType>* compNode = (ComputationNode<ElemType>*)node->GetEvalValue();
+        std::string empty;
+
+        // loop through all the optional parameters, processing them as necessary
+        for (NDLNode<ElemType>* param : params)
+        {
+            // make sure it's a "tag" optional parameter; that's all we process currently
+            if (_stricmp(param->GetName().c_str(), "tag"))
+                continue;
+
+            std::string value = param->GetValue();
+            if (!_stricmp(value.c_str(), "feature"))
+            {
+                SetOutputNode(m_net.FeatureNodes(), compNode);
+            }
+            else if (!_stricmp(value.c_str(), "label"))
+            {
+                SetOutputNode(m_net.LabelNodes(), compNode);
+            }
+            else if (!_stricmp(value.c_str(), "criteria"))
+            {
+                SetOutputNode(m_net.FinalCriterionNodes(), compNode);
+            }
+            else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters
+            {
+                SetOutputNode(m_net.EvaluationNodes(), compNode);
+            }
+            else if (!_stricmp(value.c_str(), "output"))
+            {
+                SetOutputNode(m_net.OutputNodes(), compNode);
+            }
+        }
+    }
+
+    // SetOutputNode - Set the output node; checks to see whether it already exists first
+    // nodeGroup - group vector to add to
+    // compNode - computation node to add
+    void SetOutputNode(std::vector<ComputationNode<ElemType>*>& nodeGroup, ComputationNode<ElemType>* compNode)
+    {
+        for (ComputationNodePtr node : nodeGroup)
+        {
+            if (node == compNode)
+                return;
+        }
+        nodeGroup.push_back(compNode);
+    }
+
+    // FindSymbol - Search the nodes for a fully qualified symbol
+    // symbol - name of the symbol fully
quantified name with "dots" + // returns - pointer to the matching EvalValue for that node, of NULL if not found + virtual void* FindSymbol(const wstring& symbol) + { + if (m_net.NodeNameExist(symbol)) + return m_net.GetNodeFromName(symbol); + return NULL; + } + + virtual ~SynchronousNodeEvaluator() + { + } + +private: + ComputationNetwork& m_net; + typedef ComputationNode* ComputationNodePtr; + void operator=(const SynchronousNodeEvaluator&); +}; + +// SynchronousExecutionEngine +// TODO JC Refactor eligible methods and members into abstract base class. +template +class SynchronousExecutionEngine : public IExecutionEngine +{ +public: + SynchronousExecutionEngine(DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, unsigned long randomSeedOffset=0) + { + m_computationNetwork = new ComputationNetwork(deviceId); + m_computationNetwork->SetRandomSeedOffset(randomSeedOffset); + m_ownNetwork = true; + m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); + } + + SynchronousExecutionEngine(ComputationNetwork* computationNetwork) + { + m_computationNetwork = computationNetwork; + m_ownNetwork = false; + m_nodeEvaluator = new SynchronousNodeEvaluator(*m_computationNetwork); + } + + virtual ~SynchronousExecutionEngine() + { + if (m_ownNetwork) + delete m_computationNetwork; + delete m_nodeEvaluator; + } + + ComputationNetwork& GetComputationNetwork() + { + return *m_computationNetwork; + } + + NDLNodeEvaluator& GetNodeEvaluator() + { + return *m_nodeEvaluator; + } + +private: + bool m_ownNetwork; + ComputationNetwork* m_computationNetwork; + SynchronousNodeEvaluator* m_nodeEvaluator; +protected: + // Copy constructor, should never be called. + SynchronousExecutionEngine(const SynchronousExecutionEngine& /*deepCopyFrom*/) + { + throw std::logic_error("'SynchronousExecutionEngine(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); + } + + // Assignment operator, should never be called. + SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& /*deepCopyFrom*/) + { + throw std::logic_error("'SynchronousExecutionEngine& operator=(const SynchronousExecutionEngine& deepCopyFrom)' should never be called."); + } +}; + +template class SynchronousExecutionEngine; +template class SynchronousExecutionEngine; + }}} \ No newline at end of file diff --git a/MachineLearning/cn/TrainingCriterionNode.h b/MachineLearning/cn/TrainingCriterionNode.h index 16709ec71..c962870a3 100644 --- a/MachineLearning/cn/TrainingCriterionNode.h +++ b/MachineLearning/cn/TrainingCriterionNode.h @@ -1,1245 +1,1245 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -#pragma once - -#include -#include -#include -#include -#include -#include -#include "ComputationNode.h" - -namespace Microsoft { namespace MSR { namespace CNTK { - //note: to save computation the gradient may be scaled by an constant. - - template - class SquareErrorNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - SquareErrorNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_leftMinusRight(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - SquareErrorNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_leftMinusRight(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"SquareError";} - - virtual void ComputeInputPartial(const size_t inputIndex) - { - if (inputIndex > 1) - throw std::invalid_argument("SquareError criteria only takes two inputs."); - - //left Node must be a scalar - if (inputIndex == 0) //left derivative - { - ComputeInputPartialLeft(Inputs(0)->GradientValues(), GradientValues(), m_leftMinusRight); - } - else - { - ComputeInputPartialRight(Inputs(1)->GradientValues(), GradientValues(), m_leftMinusRight); - } - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("SquareError node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialLeft(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) - { - inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), leftMinusRight); - } - - static void WINAPI ComputeInputPartialRight(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) - { - inputGradientValues.AddWithScaleOf(-gradientValues.Get00Element(), leftMinusRight); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsInput); - descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_leftMinusRight); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("SquareError node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, Matrix& leftMinusRight) - { - leftMinusRight.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1); - ElemType v = leftMinusRight.FrobeniusNorm(); - functionValues.Resize(1,1); - functionValues.SetValue(v*v/2); -#if NANCHECK - functionValues.HasNan("SquareError"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 2) - throw std::logic_error("SquareError operation requires two inputs."); - - size_t index = 0; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? 
Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - index = 1; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("SquareError operation: one of the operants has 0 element."); - - if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) - { - throw std::logic_error("The Matrix dimension in the SquareError operation does not match."); - } - - FunctionValues().Resize(1,1); - m_leftMinusRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode) - { - m_children.resize(2); - m_children[0] = leftNode; - m_children[1] = rightNode; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_leftMinusRight.GetDeviceId() != deviceId) - m_leftMinusRight.TransferFromDeviceToDevice(m_leftMinusRight.GetDeviceId(), deviceId,true); - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - SquareErrorNode* node = (SquareErrorNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_leftMinusRight = m_leftMinusRight; - } - } - - // copy constructor - SquareErrorNode(const SquareErrorNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_leftMinusRight(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new SquareErrorNode(this, name, flags); - return node; - } - - private: - Matrix m_leftMinusRight; - }; - - template class SquareErrorNode; - template class SquareErrorNode; - - //calculates: -sum(left_i * log(softmax_i(right))) - template - class CrossEntropyWithSoftmaxNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - CrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - CrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"CrossEntropyWithSoftmax";} - - virtual void ComputeInputPartial(const size_t inputIndex) - { - if (inputIndex > 1) - throw std::invalid_argument("CrossEntropyWithSoftmaxNode criterion only takes two inputs."); - - //left Node must be a scalar - if (inputIndex == 0) //left derivative - { - ComputeInputPartialLeft(m_logSoftmaxOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); - } - else - { - ComputeInputPartialRight(m_softmaxOfRight, Inputs(0)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); - } - - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialLeft(const Matrix& logSoftmaxOfRight, Matrix& inputGradientValues, - const Matrix& gradientValues) - { -#if DUMPOUTPUT - logSoftmaxOfRight.Print("CrossEntropyWithSoftmax Partial-logSoftmaxOfRight"); - gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-in"); -#endif - - Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues); -#if DUMPOUTPUT - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-out"); -#endif - - } - - static void WINAPI ComputeInputPartialRight(const Matrix& softmaxOfRight, const Matrix& inputFunctionValues, - Matrix& inputGradientValues, const Matrix& gradientValues) - { -#if DUMPOUTPUT - softmaxOfRight.Print("CrossEntropyWithSoftmax Partial-softmaxOfRight"); - inputFunctionValues.Print("CrossEntropyWithSoftmax Partial-inputFunctionValues"); - gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right-in"); -#endif - - Matrix::AddScaledDifference(gradientValues, softmaxOfRight, inputFunctionValues, inputGradientValues); -#if DUMPOUTPUT - inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right"); -#endif - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - if (inputIndex == 0) - { - descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsInput); - } - else - { - descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsInput); - descriptor->FunctionParam(0, paramOptionsInput); - } - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); - break; - case taskEvaluate: - 
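            // Editorial note (not part of the original file): taskEvaluate computes
            // CE(label, z) = -sum_i label_i * log(softmax_i(z)) via EvaluateThisNodeS.
            // The taskComputeInputPartial case above wires up the standard gradients:
            // dCE/dz = softmax(z) - label (assuming each label column sums to one),
            // realized by Matrix::AddScaledDifference in ComputeInputPartialRight, and
            // dCE/dlabel = -log(softmax(z)), realized by Matrix::ScaleAndAdd with a
            // negated scale in ComputeInputPartialLeft.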
descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsOutput); - descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_softmaxOfRight, m_logSoftmaxOfRight); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - Matrix& softmaxOfRight, Matrix& logSoftmaxOfRight) - { - logSoftmaxOfRight.AssignLogSoftmaxOf(inputFunctionValues1, true); - softmaxOfRight.SetValue(logSoftmaxOfRight); - softmaxOfRight.InplaceExp(); - functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logSoftmaxOfRight); - functionValues*=(-1); -#if NANCHECK - functionValues.HasNan("CrossEntropyWithSoftmax"); -#endif -#if DUMPOUTPUT - functionValues.Print("CrossEntropyWithSoftmaxNode"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 2) - throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires two inputs."); - - if (Inputs(0)->OperationName() != L"InputValue" && Inputs(0)->OperationName() != L"SparseInputValue") - throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); - - //we may release the constraint that the first operant is an inputValue later so the following code should be kept - size_t index = 0; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - index = 1; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? 
Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("CrossEntropyWithSoftmaxNode operation: one of the operants has 0 element."); - - if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) - { - throw std::logic_error("The Matrix dimension in the CrossEntropyWithSoftmaxNode operation does not match."); - } - - FunctionValues().Resize(1,1); - CopyImageSizeFromInputs(); - - m_logSoftmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - m_softmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - //leftNode should be the empirical - virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) - { - m_children.resize(2); - m_children[0] = label; - m_children[1] = prediction; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_logSoftmaxOfRight.GetDeviceId() != deviceId) - { - m_logSoftmaxOfRight.TransferFromDeviceToDevice(m_logSoftmaxOfRight.GetDeviceId(), deviceId,true); - } - if (m_softmaxOfRight.GetDeviceId() != deviceId) - { - m_softmaxOfRight.TransferFromDeviceToDevice(m_softmaxOfRight.GetDeviceId(), deviceId,true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - CrossEntropyWithSoftmaxNode* node = (CrossEntropyWithSoftmaxNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_logSoftmaxOfRight = m_logSoftmaxOfRight; - node->m_softmaxOfRight = m_softmaxOfRight; - } - } - - // copy constructor - CrossEntropyWithSoftmaxNode(const CrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_logSoftmaxOfRight(node->m_deviceId), m_softmaxOfRight(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new CrossEntropyWithSoftmaxNode(this, name, flags); - return node; - } - - protected: - Matrix m_logSoftmaxOfRight; - Matrix m_softmaxOfRight; - }; - - template class CrossEntropyWithSoftmaxNode; - template class CrossEntropyWithSoftmaxNode; - - //calculates: -sum(left_i * log(right_i)) - //assume softmax is already done - template - class CrossEntropyNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - CrossEntropyNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - CrossEntropyNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"CrossEntropy";} - - virtual void ComputeInputPartial(const size_t inputIndex) - { - if (inputIndex > 1) - throw std::invalid_argument("CrossEntropy criterion only takes two inputs."); - - //left Node must be a scalar - if (inputIndex == 0) //left derivative - { - ComputeInputPartialLeft(m_logOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); - } - else - { - ComputeInputPartialRight(m_leftDivRight, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); - } - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropy node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialLeft(const Matrix& logOfRight, Matrix& inputGradientValues, - const Matrix& gradientValues) - { - Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logOfRight, inputGradientValues); - } - - static void WINAPI ComputeInputPartialRight(Matrix& leftDivRight, - const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - Matrix& inputGradientValues, const Matrix& gradientValues) - { - leftDivRight.AssignElementDivisionOf(inputFunctionValues0, inputFunctionValues1); - - Matrix::ScaleAndAdd(-gradientValues.Get00Element(), leftDivRight, inputGradientValues); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - if (inputIndex == 0) - { - descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsInput); - } - else - { - descriptor->MatrixParam(m_leftDivRight, "leftDivRight", paramOptionsInput | paramOptionsTemporary); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - } - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - - virtual void EvaluateThisNode() //-sum(left_i * log(right_i)) - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_logOfRight); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("CrossEntropy node should never 
be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - Matrix& logOfRight) - { - logOfRight.SetValue(inputFunctionValues1); - logOfRight.InplaceLog(); - functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logOfRight); - functionValues*=(-1); -#if NANCHECK - functionValues.HasNan("CrossEntropy"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 2) - throw std::logic_error("CrossEntropyNode criterion requires two inputs."); - - if (Inputs(0)->OperationName() != L"InputValue") - throw std::logic_error("CrossEntropyNode criterion requires the first input to be the label."); - - //we may release the constraint that the first operant is an inputValue later so the following code should be kept - size_t index = 0; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - index = 1; - if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) - { - size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); - size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); - Inputs(index)->FunctionValues().Resize(rows, cols); - } - - if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("CrossEntropyNode operation: one of the operants has 0 element."); - - if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) - { - throw std::logic_error("The Matrix dimension in the CrossEntropyNode operation does not match."); - } - - FunctionValues().Resize(1,1); - m_logOfRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); - m_leftDivRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - //leftNode should be the empirical - virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) - { - m_children.resize(2); - m_children[0] = label; - m_children[1] = prediction; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_logOfRight.GetDeviceId() != deviceId) - { - m_logOfRight.TransferFromDeviceToDevice(m_logOfRight.GetDeviceId(), deviceId,true); - } - if (m_leftDivRight.GetDeviceId() != deviceId) - { - m_leftDivRight.TransferFromDeviceToDevice(m_leftDivRight.GetDeviceId(), deviceId,true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const 
CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - CrossEntropyNode* node = (CrossEntropyNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_logOfRight = m_logOfRight; - node->m_leftDivRight = m_leftDivRight; - } - } - - // copy constructor - CrossEntropyNode(const CrossEntropyNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_logOfRight(node->m_deviceId), m_leftDivRight(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new CrossEntropyNode(this, name, flags); - return node; - } - - private: - // matrix value passed from evaluate to computePartial - Matrix m_logOfRight; - // temporary - Matrix m_leftDivRight; - }; - - template class CrossEntropyNode; - template class CrossEntropyNode; - - template - class MatrixL1RegNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - MatrixL1RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - MatrixL1RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"MatrixL1Reg";} - - virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) - { - if (inputIndex != 0) - throw std::invalid_argument("MatrixL1RegNode only has one input."); - - ComputeInputPartialS(m_gradientOfL1Norm, Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues()); - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL1Reg node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialS(Matrix& gradientOfL1Norm, - Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& inputFunctionValues) - { - gradientOfL1Norm.AssignSignOf(inputFunctionValues); - inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), gradientOfL1Norm); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->MatrixParam(m_gradientOfL1Norm, "gradientOfL1Norm", paramOptionsInput | paramOptionsTemporary); - descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->SetFunction((FARPROC)ComputeInputPartialS); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - 
descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL1Reg node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) - { - functionValues.Resize(1,1); - functionValues.SetValue(inputFunctionValues.MatrixNorm1()); -#if NANCHECK - functionValues.HasNan("MatrixL1Reg"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 1) - throw std::logic_error("MatrixL1Reg criterion should have one input."); - - if (Inputs(0)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("MatrixL1Reg operation: the input node has 0 element."); - - FunctionValues().Resize(1,1); - m_gradientOfL1Norm.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - virtual void AttachInputs(const ComputationNodePtr singleInput) - { - m_children.resize(1); - m_children[0] = singleInput; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_gradientOfL1Norm.GetDeviceId() != deviceId) - m_gradientOfL1Norm.TransferFromDeviceToDevice(m_gradientOfL1Norm.GetDeviceId(), deviceId,true); - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - MatrixL1RegNode* node = (MatrixL1RegNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_gradientOfL1Norm = m_gradientOfL1Norm; - } - } - - // copy constructor - MatrixL1RegNode(const MatrixL1RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_gradientOfL1Norm(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new MatrixL1RegNode(this, name, flags); - return node; - } - - private: - // temporary - Matrix m_gradientOfL1Norm; - }; - - template class MatrixL1RegNode; - template class MatrixL1RegNode; - - template - class MatrixL2RegNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - MatrixL2RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_temp(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - MatrixL2RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_temp(deviceId) - { - m_nodeName = (name == L""? 
CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"MatrixL2Reg";} - - - virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) - { - if (inputIndex != 0) - throw std::invalid_argument("MatrixL2RegNode only has one input."); - - ComputeInputPartialS(Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues(), FunctionValues()); - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL2RegNode node should never be in a loop."); - } - - static void WINAPI ComputeInputPartialS(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& /*inputFunctionValues*/, const Matrix& functionValues) - { - ElemType v = gradientValues.Get00Element() / (functionValues.Get00Element() + EPS_IN_INVERSE); - inputGradientValues.AddWithScaleOf(v, gradientValues); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->GradientParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(); - descriptor->SetFunction((FARPROC)ComputeInputPartialS); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - - virtual void EvaluateThisNode() - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("MatrixL2RegNode node should never be in a loop."); - } - - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) - { - functionValues.Resize(1,1); - functionValues.SetValue(inputFunctionValues.FrobeniusNorm()); -#if NANCHECK - functionValues.HasNan("MatrixL2Reg"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 1) - throw std::logic_error("MatrixL2Reg criterion should have one input."); - - if (Inputs(0)->FunctionValues().GetNumElements() == 0) - throw std::logic_error("MatrixL2Reg operation: the input node has 0 element."); - - FunctionValues().Resize(1,1); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - virtual void AttachInputs(const ComputationNodePtr singleInput) - { - m_children.resize(1); - m_children[0] = singleInput; - } - - // copy constructor - MatrixL2RegNode(const MatrixL2RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_temp(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == 
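// Illustrative sketch (editor's addition, not part of this change-set):
// EvaluateThisNodeS above stores ||X||_F in the 1x1 criterion value, and
// ComputeInputPartialS divides the incoming scalar gradient by that stored
// norm, with EPS_IN_INVERSE guarding the division. The textbook gradient is
// d||X||_F / dX = X / ||X||_F; note that the worker above scales
// gradientValues rather than the input values, so read this as the analytic
// form, not a transcription. Helper name and dense layout are assumptions:
#include <cstddef>

template <class ElemType>
void AddL2Gradient(ElemType* grad, const ElemType* x, std::size_t n,
                   ElemType upstream, ElemType frobNorm, ElemType eps)
{
    ElemType scale = upstream / (frobNorm + eps);  // eps plays the role of EPS_IN_INVERSE
    for (std::size_t i = 0; i < n; i++)
        grad[i] += scale * x[i];                   // X / ||X||_F, scaled by upstream
}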
L"")?NodeName():newName; - - ComputationNodePtr node = new MatrixL2RegNode(this, name, flags); - return node; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_temp.GetDeviceId() != deviceId) - { - m_temp.TransferFromDeviceToDevice(m_temp.GetDeviceId(), deviceId,true); - } - } - } - - private: - Matrix m_temp; - }; - - template class MatrixL2RegNode; - template class MatrixL2RegNode; - - //calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history - template - class ClassBasedCrossEntropyWithSoftmaxNode: public ComputationNode - { - UsingComputationNodeMembers; - public: - ClassBasedCrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmax(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - ClassBasedCrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_logSoftmax(deviceId) - { - m_nodeName = (name == L""? CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - virtual const std::wstring OperationName() const {return TypeName();} - static const std::wstring TypeName() {return L"ClassBasedCrossEntropyWithSoftmax";} - - virtual void ComputeInputPartial(const size_t inputIndex) //scaled by 2*number of colmns (samples) in the Matrix - { - if (inputIndex != 1 && inputIndex != 2) - throw std::invalid_argument("ClassCrossEntropyWithSoftmaxNode criterion only takes with respect to input and weight."); - - if (inputIndex == 1) - ComputeClassEntropyGradientOfInput(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); - else - ComputeClassEntropyGradientOfWeight(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); - - } - - virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); - } - - static void ComputeClassEntropyGradientOfInput(const Matrix& /*inputFunctionValues0*/, const Matrix& /*inputFunctionValues1*/, - const Matrix& inputFunctionValues2, const Matrix* /*clsInfo*/, const Matrix* /*idx2Cls*/, - const Matrix& logSoftmax, Matrix& grd) - { - logSoftmax.ClassEntropyError(logSoftmax); - logSoftmax.ClassEntropyGradientOfInput(logSoftmax, inputFunctionValues2, grd); - } - - static void ComputeClassEntropyGradientOfWeight(const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, - const Matrix& inputFunctionValues2, const Matrix* clsInfo, const Matrix* idx2Cls, - const Matrix& logSoftmax, Matrix& grd) - { - logSoftmax.ClassEntropyGradientOfWeight(logSoftmax, - inputFunctionValues1, inputFunctionValues2, - inputFunctionValues0, - clsInfo, idx2Cls, grd); - } - - // GetTaskDescriptor - Get a task descriptor for this node - // taskType - task type we are generating a task for - virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const - { - TaskDescriptor* descriptor = new 
TaskDescriptor(this, taskType, inputIndex); - switch(taskType) - { - case taskComputeInputPartial: - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->FunctionParam(2, paramOptionsInput); - descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsInput); - descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); - descriptor->SetFunction(inputIndex==1?(FARPROC)ComputeClassEntropyGradientOfInput:(FARPROC)ComputeClassEntropyGradientOfWeight); - break; - case taskEvaluate: - descriptor->FunctionParam(); - descriptor->FunctionParam(0, paramOptionsInput); - descriptor->FunctionParam(1, paramOptionsInput); - descriptor->FunctionParam(2, paramOptionsInput); - descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); - descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsOutput); - descriptor->SetFunction((FARPROC)EvaluateThisNodeS); - break; - default: - assert(false); - throw std::logic_error("Unsupported task requested"); - } - return descriptor; - } - - virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) - { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax); - } - - virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) - { - throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); - } - - static void EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, - const Matrix& inputFunctionValues1, const Matrix& inputFunctionValues2, - const Matrix* clsInfo, const Matrix* idx2Cls, Matrix& logSoftmax) - { - logSoftmax.Resize(inputFunctionValues0.GetNumRows(), inputFunctionValues0.GetNumCols()); - logSoftmax.ClassEntropy(inputFunctionValues1, inputFunctionValues2, inputFunctionValues0, clsInfo, idx2Cls, logSoftmax, functionValues); -#if NANCHECK - functionValues.HasNan("ClassBasedCrossEntropyWithSoftmax"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 3) - throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs."); - - if (Inputs(0)->OperationName() != L"SparseInputValue" - && Inputs(0)->OperationName() != L"InputValue") - throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); - - if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols() && // label and input same obs numbers - Inputs(0)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumRows() ) ) // label and matrix match output size - { - throw std::logic_error("The Matrix dimension in the ClassBasedCrossEntropyWithSoftmaxNode operation does not match."); - } - - FunctionValues().Resize(1,1); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(0, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - 
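// Editor's aside (not part of this change-set): the Validate() above encodes
// the shape contract of the three inputs. With hidden size H, vocabulary size
// V and T observations, the label input(0) is V x T, input(1) is H x T, and
// the weight input(2) is V x H, so weight * input is V x T like the label.
// For example H = 200, V = 10000, T = 128 gives 10000x128, 200x128 and
// 10000x200. A sketch of the same check (hypothetical helper):
#include <cstddef>

inline bool ClassCEShapesOk(std::size_t labelRows, std::size_t labelCols,
                            std::size_t inRows, std::size_t inCols,
                            std::size_t wRows, std::size_t wCols)
{
    return inRows == wCols         // weight and input can be multiplied
        && labelCols == inCols     // same number of observations
        && labelRows == wRows;     // label matches the weight's output size
}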
//leftNode should be the empirical - // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class - // the first row indicates the starting row and the second row indicates the end row of a class - virtual void AddClassInfo(Matrix* classinfo, - Matrix* idx2cls) - { - m_ptrClsinfo = classinfo; - m_ptrIdx2Cls = idx2cls; - } - - //leftNode should be the empirical - // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class - // the first row indicates the starting row and the second row indicates the end row of a class - virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr input, - const ComputationNodePtr matrix) - { - m_children.resize(3); - m_children[0] = label; - m_children[1] = input; - m_children[2] = matrix; - - //initializes m_logSoftmax - m_logSoftmax.SwitchToMatrixType(SPARSE, matrixFormatSparseCSC); - m_logSoftmax.Resize(label->FunctionValues().GetNumRows(), label->FunctionValues().GetNumCols()); - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_logSoftmax.GetDeviceId() != deviceId) - { - m_logSoftmax.TransferFromDeviceToDevice(m_logSoftmax.GetDeviceId(), deviceId,true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - ClassBasedCrossEntropyWithSoftmaxNode* node = (ClassBasedCrossEntropyWithSoftmaxNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_logSoftmax = m_logSoftmax; - } - } - - // copy constructor - ClassBasedCrossEntropyWithSoftmaxNode(const ClassBasedCrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_logSoftmax(node->m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"")?NodeName():newName; - - ComputationNodePtr node = new ClassBasedCrossEntropyWithSoftmaxNode(this, name, flags); - return node; - } - - protected: - Matrix m_logSoftmax; - - Matrix* m_ptrClsinfo; - Matrix* m_ptrIdx2Cls; - }; - - template class ClassBasedCrossEntropyWithSoftmaxNode; - template class ClassBasedCrossEntropyWithSoftmaxNode; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#pragma once + +#include +#include +#include +#include +#include +#include +#include "ComputationNode.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + //note: to save computation the gradient may be scaled by an constant. + + template + class SquareErrorNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + SquareErrorNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_leftMinusRight(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + SquareErrorNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_leftMinusRight(deviceId) + { + m_nodeName = (name == L""? 
CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"SquareError";} + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex > 1) + throw std::invalid_argument("SquareError criteria only takes two inputs."); + + //left Node must be a scalar + if (inputIndex == 0) //left derivative + { + ComputeInputPartialLeft(Inputs(0)->GradientValues(), GradientValues(), m_leftMinusRight); + } + else + { + ComputeInputPartialRight(Inputs(1)->GradientValues(), GradientValues(), m_leftMinusRight); + } + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("SquareError node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialLeft(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) + { + inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), leftMinusRight); + } + + static void WINAPI ComputeInputPartialRight(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& leftMinusRight) + { + inputGradientValues.AddWithScaleOf(-gradientValues.Get00Element(), leftMinusRight); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsInput); + descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->MatrixParam(m_leftMinusRight, "leftMinusRight", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_leftMinusRight); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("SquareError node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, Matrix& leftMinusRight) + { + leftMinusRight.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1); + ElemType v = leftMinusRight.FrobeniusNorm(); + functionValues.Resize(1,1); + functionValues.SetValue(v*v/2); +#if NANCHECK + functionValues.HasNan("SquareError"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 2) + throw std::logic_error("SquareError operation requires two inputs."); + + size_t index = 0; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? 
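// Illustrative sketch (editor's addition, not part of this change-set):
// EvaluateThisNodeS above caches d = a - b and sets the criterion to
// v*v/2 = 0.5 * ||a - b||_F^2, so dE/da = d and dE/db = -d, each scaled by the
// scalar gradient g = GradientValues().Get00Element() - exactly what
// ComputeInputPartialLeft/Right add. Dense layout and helper name are assumed:
#include <cstddef>

template <class ElemType>
void AddSquareErrorGradients(ElemType* gradA, ElemType* gradB,
                             const ElemType* a, const ElemType* b,
                             std::size_t n, ElemType g)
{
    for (std::size_t i = 0; i < n; i++)
    {
        ElemType d = a[i] - b[i];
        gradA[i] += g * d;   // mirrors ComputeInputPartialLeft
        gradB[i] -= g * d;   // mirrors ComputeInputPartialRight
    }
}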
Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + index = 1; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("SquareError operation: one of the operants has 0 element."); + + if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) + { + throw std::logic_error("The Matrix dimension in the SquareError operation does not match."); + } + + FunctionValues().Resize(1,1); + m_leftMinusRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode) + { + m_children.resize(2); + m_children[0] = leftNode; + m_children[1] = rightNode; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_leftMinusRight.GetDeviceId() != deviceId) + m_leftMinusRight.TransferFromDeviceToDevice(m_leftMinusRight.GetDeviceId(), deviceId,true); + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + SquareErrorNode* node = (SquareErrorNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_leftMinusRight = m_leftMinusRight; + } + } + + // copy constructor + SquareErrorNode(const SquareErrorNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_leftMinusRight(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new SquareErrorNode(this, name, flags); + return node; + } + + private: + Matrix m_leftMinusRight; + }; + + template class SquareErrorNode; + template class SquareErrorNode; + + //calculates: -sum(left_i * log(softmax_i(right))) + template + class CrossEntropyWithSoftmaxNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + CrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) + { + m_nodeName = (name == L""? 
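// Editor's aside (not part of this change-set): the two LearnableParameter
// branches in the Validate() above apply a dimension-borrowing rule: a
// dimension that is still 0 (not yet fixed) is inherited from the peer
// operand, so a 0x0 parameter validated against a 10x32 label is resized to
// 10x32. The rule in isolation:
#include <cstddef>

inline std::size_t BorrowDim(std::size_t mine, std::size_t peers)
{
    return mine == 0 ? peers : mine;   // keep own size when set, else inherit
}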
CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + CrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmaxOfRight(deviceId), m_softmaxOfRight(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"CrossEntropyWithSoftmax";} + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex > 1) + throw std::invalid_argument("CrossEntropyWithSoftmaxNode criterion only takes two inputs."); + + //left Node must be a scalar + if (inputIndex == 0) //left derivative + { + ComputeInputPartialLeft(m_logSoftmaxOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); + } + else + { + ComputeInputPartialRight(m_softmaxOfRight, Inputs(0)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); + } + + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialLeft(const Matrix& logSoftmaxOfRight, Matrix& inputGradientValues, + const Matrix& gradientValues) + { +#if DUMPOUTPUT + logSoftmaxOfRight.Print("CrossEntropyWithSoftmax Partial-logSoftmaxOfRight"); + gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-in"); +#endif + + Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues); +#if DUMPOUTPUT + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Left-out"); +#endif + + } + + static void WINAPI ComputeInputPartialRight(const Matrix& softmaxOfRight, const Matrix& inputFunctionValues, + Matrix& inputGradientValues, const Matrix& gradientValues) + { +#if DUMPOUTPUT + softmaxOfRight.Print("CrossEntropyWithSoftmax Partial-softmaxOfRight"); + inputFunctionValues.Print("CrossEntropyWithSoftmax Partial-inputFunctionValues"); + gradientValues.Print("CrossEntropyWithSoftmax Partial-gradientValues"); + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right-in"); +#endif + + Matrix::AddScaledDifference(gradientValues, softmaxOfRight, inputFunctionValues, inputGradientValues); +#if DUMPOUTPUT + inputGradientValues.Print("CrossEntropyWithSoftmaxNode Partial-Right"); +#endif + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + if (inputIndex == 0) + { + descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsInput); + } + else + { + descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsInput); + descriptor->FunctionParam(0, paramOptionsInput); + } + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); + break; + case taskEvaluate: + 
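// Editor's aside (not part of this change-set): why this node fuses softmax
// and cross entropy. With p = softmax(z), labels y, and the scalar gradient g
// reaching this 1x1 criterion, L = -sum_i y_i * log p_i gives
//     dL/dy = -g * log p          (ComputeInputPartialLeft above), and
//     dL/dz =  g * (p - y)        (ComputeInputPartialRight above),
// i.e. the softmax Jacobian cancels the 1/p_i factor of a plain cross entropy,
// which is both cheaper and numerically safer than chaining separate nodes.
// Quick check with z = (0, 0), y = (1, 0), g = 1: p = (0.5, 0.5) and
// dL/dz = (-0.5, 0.5).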
descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->MatrixParam(m_softmaxOfRight, "softmaxOfRight", paramOptionsOutput); + descriptor->MatrixParam(m_logSoftmaxOfRight, "logSoftmaxOfRight", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_softmaxOfRight, m_logSoftmaxOfRight); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropyWithSoftmax node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + Matrix& softmaxOfRight, Matrix& logSoftmaxOfRight) + { + logSoftmaxOfRight.AssignLogSoftmaxOf(inputFunctionValues1, true); + softmaxOfRight.SetValue(logSoftmaxOfRight); + softmaxOfRight.InplaceExp(); + functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logSoftmaxOfRight); + functionValues*=(-1); +#if NANCHECK + functionValues.HasNan("CrossEntropyWithSoftmax"); +#endif +#if DUMPOUTPUT + functionValues.Print("CrossEntropyWithSoftmaxNode"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 2) + throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires two inputs."); + + if (Inputs(0)->OperationName() != L"InputValue" && Inputs(0)->OperationName() != L"SparseInputValue") + throw std::logic_error("CrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); + + //we may release the constraint that the first operant is an inputValue later so the following code should be kept + size_t index = 0; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + index = 1; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? 
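// Illustrative sketch (editor's addition, not part of this change-set):
// EvaluateThisNodeS above computes the log-softmax first and then derives the
// softmax as exp(log-softmax), so both matrices come from one stable pass.
// A per-column sketch of what AssignLogSoftmaxOf(..., true) presumably
// computes (the bool is taken to mean column-wise):
#include <cmath>
#include <cstddef>

// logsoftmax(z)_i = z_i - (m + log(sum_j exp(z_j - m))), with m = max_j z_j
template <class ElemType>
void LogSoftmaxColumn(ElemType* out, const ElemType* z, std::size_t n)
{
    ElemType m = z[0];
    for (std::size_t i = 1; i < n; i++)
        if (z[i] > m) m = z[i];
    ElemType sum = ElemType(0);
    for (std::size_t i = 0; i < n; i++)
        sum += std::exp(z[i] - m);                 // safe: exponents are <= 0
    ElemType logSumExp = m + std::log(sum);
    for (std::size_t i = 0; i < n; i++)
        out[i] = z[i] - logSumExp;
}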
Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("CrossEntropyWithSoftmaxNode operation: one of the operants has 0 element."); + + if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) + { + throw std::logic_error("The Matrix dimension in the CrossEntropyWithSoftmaxNode operation does not match."); + } + + FunctionValues().Resize(1,1); + CopyImageSizeFromInputs(); + + m_logSoftmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + m_softmaxOfRight.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + //leftNode should be the empirical + virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) + { + m_children.resize(2); + m_children[0] = label; + m_children[1] = prediction; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_logSoftmaxOfRight.GetDeviceId() != deviceId) + { + m_logSoftmaxOfRight.TransferFromDeviceToDevice(m_logSoftmaxOfRight.GetDeviceId(), deviceId,true); + } + if (m_softmaxOfRight.GetDeviceId() != deviceId) + { + m_softmaxOfRight.TransferFromDeviceToDevice(m_softmaxOfRight.GetDeviceId(), deviceId,true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + CrossEntropyWithSoftmaxNode* node = (CrossEntropyWithSoftmaxNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_logSoftmaxOfRight = m_logSoftmaxOfRight; + node->m_softmaxOfRight = m_softmaxOfRight; + } + } + + // copy constructor + CrossEntropyWithSoftmaxNode(const CrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_logSoftmaxOfRight(node->m_deviceId), m_softmaxOfRight(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new CrossEntropyWithSoftmaxNode(this, name, flags); + return node; + } + + protected: + Matrix m_logSoftmaxOfRight; + Matrix m_softmaxOfRight; + }; + + template class CrossEntropyWithSoftmaxNode; + template class CrossEntropyWithSoftmaxNode; + + //calculates: -sum(left_i * log(right_i)) + //assume softmax is already done + template + class CrossEntropyNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + CrossEntropyNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) + { + m_nodeName = (name == L""? 
CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + CrossEntropyNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logOfRight(deviceId), m_leftDivRight(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"CrossEntropy";} + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex > 1) + throw std::invalid_argument("CrossEntropy criterion only takes two inputs."); + + //left Node must be a scalar + if (inputIndex == 0) //left derivative + { + ComputeInputPartialLeft(m_logOfRight, Inputs(inputIndex)->GradientValues(), GradientValues()); + } + else + { + ComputeInputPartialRight(m_leftDivRight, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); + } + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropy node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialLeft(const Matrix& logOfRight, Matrix& inputGradientValues, + const Matrix& gradientValues) + { + Matrix::ScaleAndAdd(-gradientValues.Get00Element(), logOfRight, inputGradientValues); + } + + static void WINAPI ComputeInputPartialRight(Matrix& leftDivRight, + const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + Matrix& inputGradientValues, const Matrix& gradientValues) + { + leftDivRight.AssignElementDivisionOf(inputFunctionValues0, inputFunctionValues1); + + Matrix::ScaleAndAdd(-gradientValues.Get00Element(), leftDivRight, inputGradientValues); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + if (inputIndex == 0) + { + descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsInput); + } + else + { + descriptor->MatrixParam(m_leftDivRight, "leftDivRight", paramOptionsInput | paramOptionsTemporary); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + } + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->SetFunction(inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->MatrixParam(m_logOfRight, "logOfRight", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + + virtual void EvaluateThisNode() //-sum(left_i * log(right_i)) + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_logOfRight); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("CrossEntropy node should never 
be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + Matrix& logOfRight) + { + logOfRight.SetValue(inputFunctionValues1); + logOfRight.InplaceLog(); + functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logOfRight); + functionValues*=(-1); +#if NANCHECK + functionValues.HasNan("CrossEntropy"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 2) + throw std::logic_error("CrossEntropyNode criterion requires two inputs."); + + if (Inputs(0)->OperationName() != L"InputValue") + throw std::logic_error("CrossEntropyNode criterion requires the first input to be the label."); + + //we may release the constraint that the first operant is an inputValue later so the following code should be kept + size_t index = 0; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + index = 1; + if (Inputs(index)->OperationName() == LearnableParameter::TypeName()) + { + size_t rows = Inputs(index)->FunctionValues().GetNumRows() == 0? Inputs(1-index)->FunctionValues().GetNumRows() : Inputs(index)->FunctionValues().GetNumRows(); + size_t cols = Inputs(index)->FunctionValues().GetNumCols() == 0? Inputs(1-index)->FunctionValues().GetNumCols() : Inputs(index)->FunctionValues().GetNumCols(); + Inputs(index)->FunctionValues().Resize(rows, cols); + } + + if (Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("CrossEntropyNode operation: one of the operants has 0 element."); + + if (!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) ) + { + throw std::logic_error("The Matrix dimension in the CrossEntropyNode operation does not match."); + } + + FunctionValues().Resize(1,1); + m_logOfRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); + m_leftDivRight.Resize(Inputs(1)->FunctionValues().GetNumRows(), Inputs(1)->FunctionValues().GetNumCols()); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + //leftNode should be the empirical + virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr prediction) + { + m_children.resize(2); + m_children[0] = label; + m_children[1] = prediction; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_logOfRight.GetDeviceId() != deviceId) + { + m_logOfRight.TransferFromDeviceToDevice(m_logOfRight.GetDeviceId(), deviceId,true); + } + if (m_leftDivRight.GetDeviceId() != deviceId) + { + m_leftDivRight.TransferFromDeviceToDevice(m_leftDivRight.GetDeviceId(), deviceId,true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const 
CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + CrossEntropyNode* node = (CrossEntropyNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_logOfRight = m_logOfRight; + node->m_leftDivRight = m_leftDivRight; + } + } + + // copy constructor + CrossEntropyNode(const CrossEntropyNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_logOfRight(node->m_deviceId), m_leftDivRight(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new CrossEntropyNode(this, name, flags); + return node; + } + + private: + // matrix value passed from evaluate to computePartial + Matrix m_logOfRight; + // temporary + Matrix m_leftDivRight; + }; + + template class CrossEntropyNode; + template class CrossEntropyNode; + + template + class MatrixL1RegNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + MatrixL1RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + MatrixL1RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_gradientOfL1Norm(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"MatrixL1Reg";} + + virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) + { + if (inputIndex != 0) + throw std::invalid_argument("MatrixL1RegNode only has one input."); + + ComputeInputPartialS(m_gradientOfL1Norm, Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues()); + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL1Reg node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialS(Matrix& gradientOfL1Norm, + Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& inputFunctionValues) + { + gradientOfL1Norm.AssignSignOf(inputFunctionValues); + inputGradientValues.AddWithScaleOf(gradientValues.Get00Element(), gradientOfL1Norm); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->MatrixParam(m_gradientOfL1Norm, "gradientOfL1Norm", paramOptionsInput | paramOptionsTemporary); + descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->SetFunction((FARPROC)ComputeInputPartialS); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + 
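// Editor's aside (not part of this change-set): in these GetPTaskDescriptor
// bodies, the registration order mirrors the static worker's parameter list.
// For ComputeInputPartialS above:
//     MatrixParam(m_gradientOfL1Norm, ...) -> gradientOfL1Norm (scratch,
//                                             hence paramOptionsTemporary)
//     GradientParam(0, in|out|initialize)  -> inputGradientValues (accumulated)
//     GradientParam()                      -> this node's own gradient values
//     FunctionParam(0, paramOptionsInput)  -> inputFunctionValues
// SetFunction then wires in the worker itself via the FARPROC cast.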
descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL1Reg node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) + { + functionValues.Resize(1,1); + functionValues.SetValue(inputFunctionValues.MatrixNorm1()); +#if NANCHECK + functionValues.HasNan("MatrixL1Reg"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 1) + throw std::logic_error("MatrixL1Reg criterion should have one input."); + + if (Inputs(0)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("MatrixL1Reg operation: the input node has 0 element."); + + FunctionValues().Resize(1,1); + m_gradientOfL1Norm.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + virtual void AttachInputs(const ComputationNodePtr singleInput) + { + m_children.resize(1); + m_children[0] = singleInput; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_gradientOfL1Norm.GetDeviceId() != deviceId) + m_gradientOfL1Norm.TransferFromDeviceToDevice(m_gradientOfL1Norm.GetDeviceId(), deviceId,true); + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + MatrixL1RegNode* node = (MatrixL1RegNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_gradientOfL1Norm = m_gradientOfL1Norm; + } + } + + // copy constructor + MatrixL1RegNode(const MatrixL1RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_gradientOfL1Norm(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new MatrixL1RegNode(this, name, flags); + return node; + } + + private: + // temporary + Matrix m_gradientOfL1Norm; + }; + + template class MatrixL1RegNode; + template class MatrixL1RegNode; + + template + class MatrixL2RegNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + MatrixL2RegNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_temp(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + MatrixL2RegNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_temp(deviceId) + { + m_nodeName = (name == L""? 
CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"MatrixL2Reg";} + + + virtual void ComputeInputPartial(const size_t inputIndex) // scale by number of cols (or samples) + { + if (inputIndex != 0) + throw std::invalid_argument("MatrixL2RegNode only has one input."); + + ComputeInputPartialS(Inputs(0)->GradientValues(), GradientValues(), Inputs(0)->FunctionValues(), FunctionValues()); + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL2RegNode node should never be in a loop."); + } + + static void WINAPI ComputeInputPartialS(Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& /*inputFunctionValues*/, const Matrix& functionValues) + { + ElemType v = gradientValues.Get00Element() / (functionValues.Get00Element() + EPS_IN_INVERSE); + inputGradientValues.AddWithScaleOf(v, gradientValues); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->GradientParam(0, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->GradientParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(); + descriptor->SetFunction((FARPROC)ComputeInputPartialS); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("MatrixL2RegNode node should never be in a loop."); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues) + { + functionValues.Resize(1,1); + functionValues.SetValue(inputFunctionValues.FrobeniusNorm()); +#if NANCHECK + functionValues.HasNan("MatrixL2Reg"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 1) + throw std::logic_error("MatrixL2Reg criterion should have one input."); + + if (Inputs(0)->FunctionValues().GetNumElements() == 0) + throw std::logic_error("MatrixL2Reg operation: the input node has 0 element."); + + FunctionValues().Resize(1,1); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + virtual void AttachInputs(const ComputationNodePtr singleInput) + { + m_children.resize(1); + m_children[0] = singleInput; + } + + // copy constructor + MatrixL2RegNode(const MatrixL2RegNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_temp(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == 
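// Editor's aside (not part of this change-set): cloning follows the same
// three-step pattern throughout this header. Duplicate() picks the name and
// allocates via the flags-aware copy constructor, which delegates to CopyTo();
// passing CopyNodeFlags::copyNodeValue also copies the numeric buffers, not
// just the node's wiring. Illustrative call (names hypothetical):
//     ComputationNodePtr clone = node->Duplicate(L"l2#copy", CopyNodeFlags::copyNodeValue);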
L"")?NodeName():newName; + + ComputationNodePtr node = new MatrixL2RegNode(this, name, flags); + return node; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_temp.GetDeviceId() != deviceId) + { + m_temp.TransferFromDeviceToDevice(m_temp.GetDeviceId(), deviceId,true); + } + } + } + + private: + Matrix m_temp; + }; + + template class MatrixL2RegNode; + template class MatrixL2RegNode; + + //calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history + template + class ClassBasedCrossEntropyWithSoftmaxNode: public ComputationNode + { + UsingComputationNodeMembers; + public: + ClassBasedCrossEntropyWithSoftmaxNode(const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmax(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + ClassBasedCrossEntropyWithSoftmaxNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId=AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_logSoftmax(deviceId) + { + m_nodeName = (name == L""? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + virtual const std::wstring OperationName() const {return TypeName();} + static const std::wstring TypeName() {return L"ClassBasedCrossEntropyWithSoftmax";} + + virtual void ComputeInputPartial(const size_t inputIndex) //scaled by 2*number of colmns (samples) in the Matrix + { + if (inputIndex != 1 && inputIndex != 2) + throw std::invalid_argument("ClassCrossEntropyWithSoftmaxNode criterion only takes with respect to input and weight."); + + if (inputIndex == 1) + ComputeClassEntropyGradientOfInput(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); + else + ComputeClassEntropyGradientOfWeight(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax, Inputs(inputIndex)->GradientValues()); + + } + + virtual void ComputeInputPartial(const size_t /*inputIndex*/, const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); + } + + static void ComputeClassEntropyGradientOfInput(const Matrix& /*inputFunctionValues0*/, const Matrix& /*inputFunctionValues1*/, + const Matrix& inputFunctionValues2, const Matrix* /*clsInfo*/, const Matrix* /*idx2Cls*/, + const Matrix& logSoftmax, Matrix& grd) + { + logSoftmax.ClassEntropyError(logSoftmax); + logSoftmax.ClassEntropyGradientOfInput(logSoftmax, inputFunctionValues2, grd); + } + + static void ComputeClassEntropyGradientOfWeight(const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, + const Matrix& inputFunctionValues2, const Matrix* clsInfo, const Matrix* idx2Cls, + const Matrix& logSoftmax, Matrix& grd) + { + logSoftmax.ClassEntropyGradientOfWeight(logSoftmax, + inputFunctionValues1, inputFunctionValues2, + inputFunctionValues0, + clsInfo, idx2Cls, grd); + } + + // GetTaskDescriptor - Get a task descriptor for this node + // taskType - task type we are generating a task for + virtual TaskDescriptor* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const + { + TaskDescriptor* descriptor = new 
TaskDescriptor(this, taskType, inputIndex); + switch(taskType) + { + case taskComputeInputPartial: + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->FunctionParam(2, paramOptionsInput); + descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsInput); + descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize); + descriptor->SetFunction(inputIndex==1?(FARPROC)ComputeClassEntropyGradientOfInput:(FARPROC)ComputeClassEntropyGradientOfWeight); + break; + case taskEvaluate: + descriptor->FunctionParam(); + descriptor->FunctionParam(0, paramOptionsInput); + descriptor->FunctionParam(1, paramOptionsInput); + descriptor->FunctionParam(2, paramOptionsInput); + descriptor->MatrixParam(*m_ptrClsinfo, "clsInfo", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(*m_ptrIdx2Cls, "idx2Cls", paramOptionsInput | paramOptionsConstant); + descriptor->MatrixParam(m_logSoftmax, "logSoftmax", paramOptionsOutput); + descriptor->SetFunction((FARPROC)EvaluateThisNodeS); + break; + default: + assert(false); + throw std::logic_error("Unsupported task requested"); + } + return descriptor; + } + + virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) + { + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), m_ptrClsinfo, m_ptrIdx2Cls, m_logSoftmax); + } + + virtual void EvaluateThisNode(const size_t /*timeIdxInSeq*/) + { + throw std::logic_error("ClassCrossEntropyWithSoftmax node should never be in a loop."); + } + + static void EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, + const Matrix& inputFunctionValues1, const Matrix& inputFunctionValues2, + const Matrix* clsInfo, const Matrix* idx2Cls, Matrix& logSoftmax) + { + logSoftmax.Resize(inputFunctionValues0.GetNumRows(), inputFunctionValues0.GetNumCols()); + logSoftmax.ClassEntropy(inputFunctionValues1, inputFunctionValues2, inputFunctionValues0, clsInfo, idx2Cls, logSoftmax, functionValues); +#if NANCHECK + functionValues.HasNan("ClassBasedCrossEntropyWithSoftmax"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 3) + throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs."); + + if (Inputs(0)->OperationName() != SparseInputValue::TypeName() + && Inputs(0)->OperationName() != InputValue::TypeName()) + throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); + + if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols() && // label and input same obs numbers + Inputs(0)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumRows() ) ) // label and matrix match output size + { + throw std::logic_error("The Matrix dimension in the ClassBasedCrossEntropyWithSoftmaxNode operation does not match."); + } + + FunctionValues().Resize(1,1); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; 
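// Editor's aside (not part of this change-set): a worked example of the class
// layout documented just below. With a 6-word vocabulary split into 2
// contiguous classes (words 0-2 and 3-5), and assuming the "end row" is
// exclusive (the comment below does not pin down inclusivity):
//     classinfo (2 x 2):  row 0 = { 0, 3 }       // starting word row per class
//                         row 1 = { 3, 6 }       // end word row per class
//     idx2cls   (6 x 1):  { 0, 0, 0, 1, 1, 1 }   // word row -> class id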
+ } + + //leftNode should be the empirical + // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class + // the first row indicates the starting row and the second row indicates the end row of a class + virtual void AddClassInfo(Matrix* classinfo, + Matrix* idx2cls) + { + m_ptrClsinfo = classinfo; + m_ptrIdx2Cls = idx2cls; + } + + //leftNode should be the empirical + // classinfo is a matrix of N columns and 2 rows. N columns correspond to N class + // the first row indicates the starting row and the second row indicates the end row of a class + virtual void AttachInputs(const ComputationNodePtr label, const ComputationNodePtr input, + const ComputationNodePtr matrix) + { + m_children.resize(3); + m_children[0] = label; + m_children[1] = input; + m_children[2] = matrix; + + //initializes m_logSoftmax + m_logSoftmax.SwitchToMatrixType(SPARSE, matrixFormatSparseCSC); + m_logSoftmax.Resize(label->FunctionValues().GetNumRows(), label->FunctionValues().GetNumCols()); + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_logSoftmax.GetDeviceId() != deviceId) + { + m_logSoftmax.TransferFromDeviceToDevice(m_logSoftmax.GetDeviceId(), deviceId,true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + ClassBasedCrossEntropyWithSoftmaxNode* node = (ClassBasedCrossEntropyWithSoftmaxNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) + { + node->m_logSoftmax = m_logSoftmax; + } + } + + // copy constructor + ClassBasedCrossEntropyWithSoftmaxNode(const ClassBasedCrossEntropyWithSoftmaxNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_logSoftmax(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"")?NodeName():newName; + + ComputationNodePtr node = new ClassBasedCrossEntropyWithSoftmaxNode(this, name, flags); + return node; + } + + protected: + Matrix m_logSoftmax; + + Matrix* m_ptrClsinfo; + Matrix* m_ptrIdx2Cls; + }; + + template class ClassBasedCrossEntropyWithSoftmaxNode; + template class ClassBasedCrossEntropyWithSoftmaxNode; + }}} \ No newline at end of file diff --git a/MachineLearning/cn/cn.cpp b/MachineLearning/cn/cn.cpp index d40e369c6..40584fa83 100644 --- a/MachineLearning/cn/cn.cpp +++ b/MachineLearning/cn/cn.cpp @@ -1,738 +1,771 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// cn.cpp : Defines the entry point for the console application. 
-// - -#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ - -#include "stdafx.h" -#include "ComputationNetwork.h" -#include "ComputationNode.h" -#include "DataReader.h" -#include "DataWriter.h" -#include "SimpleNetworkBuilder.h" -#include "NDLNetworkBuilder.h" -#include "SynchronousExecutionEngine.h" -#include "ModelEditLanguage.h" -#include "SGD.h" -#include -#include "commandArgUtil.h" -#include "SimpleEvaluator.h" -#include "SimpleOutputWriter.h" -#include -#include -#if defined(_WIN32) -#include "io.h" -#endif -#include "hostname.h" -#ifdef LEAKDETECT -#include "vld.h" // for memory leak detection -#endif -#include -#include "BestGpu.h" - -// MPI builds on windows require the following installed to "c:\program files\Microsoft MPI\" -// HPC Pack 2012 R2 MS-MPI Redistributable Package -// http://www.microsoft.com/en-us/download/details.aspx?id=41634 - -#ifdef MPI_SUPPORT -#include "mpi.h" -#pragma comment(lib, "msmpi.lib") -#endif -int numProcs; -int myRank; - -using namespace std; -using namespace Microsoft::MSR::CNTK; - -// internal test routine forward declaration -template -void TestCn(const ConfigParameters& config); - -void RedirectStdErr(wstring logpath) -{ - fprintf (stderr, "Redirecting stderr to file %S\n", logpath.c_str()); - msra::files::make_intermediate_dirs (logpath); - auto_file_ptr f (logpath.c_str(), "wb"); - if (dup2 (fileno (f), 2) == -1) - RuntimeError ("unexpected failure to redirect stderr to log file"); - setvbuf (stderr, NULL, _IONBF, 16384); // unbuffer it -} - -std::string WCharToString(const wchar_t* wst) -{ - std::wstring ws(wst); - std::string s(ws.begin(), ws.end()); - s.assign(ws.begin(), ws.end()); - return s; -} - -template -void DumpNodeInfo(const ConfigParameters& config) -{ - wstring modelPath = config("modelPath"); - wstring nodeName = config("nodeName",L"__AllNodes__"); - wstring defOutFilePath = modelPath + L"." 
+ nodeName + L".txt"; - wstring outputFile = config("outputFile", WCharToString(defOutFilePath.c_str()).c_str()); - bool printValues = config("printValues", "true"); - - ComputationNetwork net(-1); //always use CPU - net.LoadFromFile(modelPath); - net.DumpNodeInfoToFile(nodeName, printValues, outputFile); -} - -template -void DoEvalBase(const ConfigParameters& config, IDataReader& reader) -{ - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "40960"); - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - - int traceLevel = config("traceLevel", "0"); - size_t numMBsToShowResult = config("numMBsToShowResult", "100"); - - ConfigArray evalNodeNames = config("evalNodeNames",""); - vector evalNodeNamesVector; - for (int i=0; i < evalNodeNames.size(); ++i) - { - evalNodeNamesVector.push_back(evalNodeNames[i]); - } - - ComputationNetwork net(deviceId); - net.LoadFromFile(modelPath); - net.ResetEvalTimeStamp(); - - SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); - eval.Evaluate(reader, evalNodeNamesVector, mbSize[0], epochSize); -} - -template -void DoEval(const ConfigParameters& config) -{ - //test - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DataReader testDataReader(readerConfig); - - DoEvalBase(config, testDataReader); -} - -template -void DoEvalUnroll(const ConfigParameters& config) -{ - //test - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DataReader testDataReader(readerConfig); - - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "40960"); - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - wstring path2EvalResults = config("path2EvalResults", L""); - - ComputationNetwork net(deviceId); - net.LoadFromFile(modelPath); - net.ResetEvalTimeStamp(); - - SimpleEvaluator eval(net); - ElemType evalEntropy; - eval.EvaluateUnroll(testDataReader, mbSize[0], evalEntropy, path2EvalResults == L""? 
nullptr : path2EvalResults.c_str(), epochSize); -} - -template -void DoCrossValidate(const ConfigParameters& config) -{ - //test - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "40960"); - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - - ConfigArray cvIntervalConfig = config("crossValidationInterval"); - intargvector cvInterval = cvIntervalConfig; - - size_t sleepSecondsBetweenRuns = config("sleepTimeBetweenRuns", "0"); - - int traceLevel = config("traceLevel", "0"); - size_t numMBsToShowResult = config("numMBsToShowResult", "100"); - - ConfigArray evalNodeNames = config("evalNodeNames",""); - vector evalNodeNamesVector; - for (int i=0; i < evalNodeNames.size(); ++i) - { - evalNodeNamesVector.push_back(evalNodeNames[i]); - } - - std::vector> cvErrorResults; - std::vector cvModels; - - DataReader cvDataReader(readerConfig); - - bool finalModelEvaluated = false; - for (size_t i=cvInterval[0]; i<=cvInterval[2]; i+=cvInterval[1]) - { - wstring cvModelPath = msra::strfun::wstrprintf (L"%ls.%lld", modelPath.c_str(), i); - - if (!fexists (cvModelPath)) - { - fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str()); - if (finalModelEvaluated || !fexists (modelPath)) - continue; // file missing - else - { - cvModelPath = modelPath; - finalModelEvaluated = true; - } - } - - cvModels.push_back(cvModelPath); - ComputationNetwork net(deviceId); - net.LoadFromFile(cvModelPath); - net.ResetEvalTimeStamp(); - - SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); - - fprintf(stderr, "model %ls --> \n",cvModelPath.c_str()); - std::vector evalErrors; - evalErrors = eval.Evaluate(cvDataReader, evalNodeNamesVector, mbSize[0], epochSize); - cvErrorResults.push_back(evalErrors); - - ::Sleep(1000*sleepSecondsBetweenRuns); - } - - //find best model - if (cvErrorResults.size() == 0) - throw std::logic_error("No model is evaluated."); - - std::vector minErrors; - std::vector minErrIds; - std::vector evalErrors = cvErrorResults[0]; - for (int i=0; i < evalErrors.size(); ++i) - { - minErrors.push_back(evalErrors[i]); - minErrIds.push_back(0); - } - - for (int i=0; i -void DoWriteOutput(const ConfigParameters& config) -{ - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - readerConfig.Insert("randomize","None"); //we don't want randomization when output results - - DataReader testDataReader(readerConfig); - - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config("minibatchSize", "2048"); - wstring modelPath = config("modelPath"); - intargvector mbSize = minibatchSize; - - size_t epochSize = config("epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - - ConfigArray outputNodeNames = config("outputNodeNames",""); - vector outputNodeNamesVector; - for (int i=0; i < outputNodeNames.size(); ++i) - { - outputNodeNamesVector.push_back(outputNodeNames[i]); - } - - ComputationNetwork net(deviceId); - net.LoadFromFile(modelPath); - net.ResetEvalTimeStamp(); - - SimpleOutputWriter writer(net, 1); - - if (config.Exists("writer")) - { - ConfigParameters writerConfig (config("writer")); - bool bWriterUnittest = writerConfig("unittest","false"); - DataWriter 
testDataWriter(writerConfig); - writer.WriteOutput(testDataReader,mbSize[0], testDataWriter, outputNodeNamesVector, epochSize, bWriterUnittest); - } - else if (config.Exists("outputPath")) - { - wstring outputPath = config("outputPath"); // crashes if no default given? - writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, epochSize); - } - //writer.WriteOutput(testDataReader, mbSize[0], testDataWriter, outputNodeNamesVector, epochSize); -} - -namespace Microsoft { namespace MSR { namespace CNTK { - -TrainingCriterion ParseTrainingCriterionString(wstring s) -{ - msra::strfun::tolower_ascii(s); - if (s==L"crossentropywithsoftmax") - return TrainingCriterion::CrossEntropyWithSoftmax; - else if (s==L"squareerror") - return TrainingCriterion::SquareError; - else if (s!=L"classcrossentropywithsoftmax") // (twisted logic to keep compiler happy w.r.t. not returning from LogicError) - LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (CrossEntropyWithSoftmax | SquareError | ClassCrossEntropyWithSoftmax)"); - return TrainingCriterion::ClassCrossEntropyWithSoftmax; -} - -EvalCriterion ParseEvalCriterionString(wstring s) -{ - msra::strfun::tolower_ascii(s); - if (s==L"errorprediction") - return EvalCriterion::ErrorPrediction; - else if (s==L"crossentropywithsoftmax") - return EvalCriterion::CrossEntropyWithSoftmax; - else if (s==L"classcrossentropywithsoftmax") - return EvalCriterion::ClassCrossEntropyWithSoftmax; - else if (s!=L"squareerror") - LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (ErrorPrediction | CrossEntropyWithSoftmax | SquareError)"); - return EvalCriterion::SquareError; -} - -}}}; - -template -void DoCreateLabelMap(const ConfigParameters& config) -{ - // this gets the section name we are interested in - std::string section = config("section"); - // get that section (probably a peer config section, which works thanks to heirarchal symbol resolution) - ConfigParameters configSection (config(section)); - ConfigParameters readerConfig (configSection("reader")); - readerConfig.Insert("allowMapCreation","true"); - DEVICEID_TYPE deviceId = CPUDEVICE; - size_t minibatchSize = config("minibatchSize", "2048"); - int traceLevel = config("traceLevel","0"); - std::vector featureNames; - std::vector labelNames; - GetFileConfigNames(readerConfig, featureNames, labelNames); - - // setup minibatch matrices - Matrix featuresMatrix(deviceId); - Matrix labelsMatrix(deviceId); - std::map*> matrices; - matrices[featureNames[0]] = &featuresMatrix; - if (labelNames.size() == 0) - RuntimeError("CreateLabelMap: no labels found to process"); - - // now create the reader and loop through the entire dataset to get all the labels - auto start = std::chrono::system_clock::now(); - for (const std::wstring& labelsName: labelNames) - { - // take the last label file defined (the other one might be input) - matrices[labelsName] = &labelsMatrix; - - // get the label mapping file name - ConfigParameters labelConfig (readerConfig(labelsName)); - std::string labelMappingFile; - if (labelConfig.ExistsCurrent("labelMappingFile")) - labelMappingFile = labelConfig("labelMappingFile"); - else if (readerConfig.ExistsCurrent("labelMappingFile")) - labelMappingFile = labelConfig("labelMappingFile"); - else - RuntimeError("CreateLabelMap: No labelMappingFile defined"); - - if (fexists(labelMappingFile)) - { - fprintf(stderr,"CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str()); - return; - } 
- fprintf(stderr,"CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str()); - - DataReader dataReader(readerConfig); - - dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize); - int count = 0; - while (dataReader.GetMinibatch(matrices)) - { - Matrix& features = *matrices[featureNames[0]]; - count += features.GetNumCols(); - if (traceLevel > 1) - fprintf(stderr,"."); // progress meter - } - dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize); - - // print the results - if (traceLevel > 0) - fprintf(stderr,"\nread %d labels and produced %s\n", count, labelMappingFile.c_str()); - } - auto end = std::chrono::system_clock::now(); - auto elapsed = end-start; - if (traceLevel > 1) - fprintf(stderr, "%f seconds elapsed\n", (float)(std::chrono::duration_cast(elapsed).count())/1000); -} - - -template -void DoTrain(const ConfigParameters& config) -{ - ConfigParameters configSGD (config("SGD")); - bool makeMode = config("makeMode", "true"); - - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - IComputationNetBuilder* netBuilder = NULL; - - if (config.Exists("NDLNetworkBuilder")) - { - ConfigParameters configNDL (config("NDLNetworkBuilder")); - netBuilder = (IComputationNetBuilder*)new NDLBuilder(configNDL); - } - else if (config.Exists("SimpleNetworkBuilder")) - { - ConfigParameters configSNB (config("SimpleNetworkBuilder")); - netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(configSNB); - } - else - { - RuntimeError("No network builder found in the config file. NDLNetworkBuilder or SimpleNetworkBuilde must be specified" ); - } - - DataReader* dataReader = new DataReader(readerConfig); - - DataReader* cvDataReader = nullptr; - ConfigParameters cvReaderConfig (config("cvReader", L"")); - - if (cvReaderConfig.size() != 0) - { - cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); - cvDataReader = new DataReader(cvReaderConfig); - } - - SGD sgd(configSGD); - - sgd.Train(netBuilder, dataReader, cvDataReader, makeMode); - - delete netBuilder; - delete dataReader; - delete cvDataReader; -} - -template -void DoAdapt(const ConfigParameters& config) -{ - DEVICEID_TYPE deviceId = DeviceFromConfig(config); - - ConfigParameters configSGD (config("SGD")); - bool makeMode = config("makeMode", "true"); - - ConfigParameters readerConfig (config("reader")); - readerConfig.Insert("traceLevel",config("traceLevel","0")); - - DataReader* dataReader = new DataReader(readerConfig); - - DataReader* cvDataReader = nullptr; - ConfigParameters cvReaderConfig (config("cvReader", L"")); - - if (cvReaderConfig.size() != 0) - { - cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); - cvDataReader = new DataReader(cvReaderConfig); - } - - wstring origModelFileName = config("origModelFileName", L""); - wstring refNodeName = config("refNodeName", L""); - - SGD sgd(configSGD); - - sgd.Adapt(origModelFileName, refNodeName, dataReader, cvDataReader, deviceId, makeMode); - - delete dataReader; - delete cvDataReader; -} - -template -void DoEdit(const ConfigParameters& config) -{ - wstring editPath = config("editPath"); - wstring ndlMacros = config("ndlMacros",""); - NDLScript ndlScript; - if (!ndlMacros.empty()) - ndlScript.LoadConfigFile(ndlMacros); - MELScript melScript; - melScript.LoadConfigFileAndResolveVariables(editPath, config); -} - -template -void DoConvertFromDbn(const ConfigParameters& config) -{ - //config.Insert("deviceId","-1"); //force using CPU - - wstring modelPath = 
config("modelPath"); - wstring dbnModelPath = config("dbnModelPath"); - - IComputationNetBuilder* netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(config); - ComputationNetwork& net = netBuilder->LoadNetworkFromFile(dbnModelPath); - net.SaveToFile(modelPath); - delete (netBuilder); -} -// process the command -template -void DoCommand(const ConfigParameters& config) -{ - ConfigArray command = config("command", "train"); - for (int i=0; i < command.size(); i++) - { - //get the configuration parameters that match the command - ConfigParameters commandParams (config(command[i])); - ConfigArray action = commandParams("action","train"); - - // determine the action to perform, and do it - for (int j=0; j < action.size(); j++) - { - if (action[j] == "train" || action[j] == "trainRNN") - DoTrain(commandParams); - else if (action[j] == "adapt") - DoAdapt(commandParams); - else if (action[j] == "test" || action[j] == "eval") - DoEval(commandParams); - else if (action[j] == "testunroll") - DoEvalUnroll(commandParams); - else if (action[j] == "edit") - DoEdit(commandParams); - else if (action[j] == "cv") - DoCrossValidate(commandParams); - else if (action[j] == "write") - DoWriteOutput(commandParams); - else if (action[j] == "devtest") - TestCn(config); // for "devtest" action pass the root config instead - else if (action[j] == "dumpnode") - DumpNodeInfo(commandParams); - else if (action[j] == "convertdbn") - DoConvertFromDbn(commandParams); - else if (action[j] == "createLabelMap") - DoCreateLabelMap(commandParams); - else - RuntimeError("unknown action: %s in command set: %s", action[j].c_str(), command[i].c_str()); - - NDLScript ndlScript; - ndlScript.ClearGlobal(); // clear global macros between commands - } - } -} - -std::string TimeDateStamp() -{ -#if 0 // "safe" version for Windows, not needed it seems - __time64_t localtime; - - _time64 (&localtime);// get current time and date - struct tm now; - _localtime64_s (&now, &localtime); // convert -#else - time_t t = time(NULL); - struct tm now = *localtime(&t); -#endif - char buf[30]; - sprintf (buf, "%04d/%02d/%02d %02d:%02d:%02d", now.tm_year + 1900, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec); - return buf; -} - -#ifdef MPI_SUPPORT -// Oh, my gosh, this is going to be ugly. MPI_INIT needs a char* argv[], so let's interface. 
-int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv) -{ - // this maps from the strings - std::map recover_wstring; - - // do the mapping to 8-bit encoding for MPI_Init() - vector> argv_string_vector; - transform(*argv, *argv + *argc, std::back_inserter(argv_string_vector), - [&recover_wstring](wchar_t*pws)->vector - { - std::string tmp = msra::strfun::utf8(std::wstring(pws)); - recover_wstring[tmp] = pws; - vector rv(tmp.begin(), tmp.end()); - rv.push_back('\0'); - return rv; - } - ); - vector argv_charptr_vector; - transform(argv_string_vector.begin(), argv_string_vector.end(), std::back_inserter(argv_charptr_vector), - [](std::vector&cs)->char*{ return &(cs[0]); } - ); - char** argv_char = &(argv_charptr_vector[0]); - - // Do the initialization - int rv = MPI_Init(argc, &argv_char); - - // try and reconstruct how MPI_Init changed the argv - transform(argv_char, argv_char + *argc, stdext::checked_array_iterator(*argv, *argc), - [&recover_wstring](char*pc)->wchar_t* - { - auto it = recover_wstring.find(std::string(pc)); - if (it == recover_wstring.end()) - RuntimeError("Unexpected interaction between MPI_Init and command line parameters"); - return it->second; - } - ); - - // pass through return value from internal call to MPI_Init() - return rv; -} -#endif - -int wmain(int argc, wchar_t* argv[]) -{ - try - { -#ifdef MPI_SUPPORT - { - int rc; - rc = MPI_Init(&argc, &argv); - if (rc != MPI_SUCCESS) - { - MPI_Abort(MPI_COMM_WORLD, rc); - RuntimeError("Failure in MPI_Init: %d", rc); - } - MPI_Comm_size(MPI_COMM_WORLD, &numProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &myRank); - fprintf(stderr, "MPI: RUNNING ON (%s), process %d/%d\n", getenv("COMPUTERNAME"), myRank, numProcs); - fflush(stderr); - } -#else - numProcs = 1; - myRank = 0; -#endif - - ConfigParameters config; - std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); - - // get the command param set they want - wstring logpath = config("stderr", L""); - ConfigArray command = config("command", "train"); - - if (logpath != L"") - { - for (int i=0; i < command.size(); i++) - { - logpath += L"_"; - logpath += (wstring)command[i]; - } - logpath += L".log"; - if (numProcs > 1) - { - std::wostringstream oss; - oss << myRank; - logpath += L"rank" + oss.str(); - } - RedirectStdErr(logpath); - } - - std::string timestamp = TimeDateStamp(); - - if (myRank == 0) // main process - { - //dump config info - fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); - fprintf(stderr, "command line options: \n"); - for (int i = 1; i < argc; i++) - fprintf(stderr, "%s ", WCharToString(argv[i]).c_str()); - - // This simply merges all the different config parameters specified (eg, via config files or via command line directly), - // and prints it. - fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n"); - fprintf(stderr, "%s\n", rawConfigString.c_str()); - fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n"); - - // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line), - // All of these assignments will appear, even though only the last assignment matters. 
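// A hypothetical illustration of the three dumps (the variable names are made
// up, not from the original source): given a config containing
//
//     rootDir=c:\data
//     featFile=$rootDir$\train.feats
//
// the raw dump prints featFile=$rootDir$\train.feats verbatim, the resolved
// dump prints featFile=c:\data\train.feats, and if rootDir is also overridden
// on the command line, both assignments show up in the resolved dump while the
// processed dump keeps only the final value.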
- fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); - fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str()); - fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); - - // This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last - // value it is set to will appear). - fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); - config.dumpWithResolvedVariables(); - fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); - - fprintf(stderr, "command: "); - for (int i = 0; i < command.size(); i++) - { - fprintf(stderr, "%s ", command[i].c_str()); - } - } - - //run commands - std::string type = config("precision", "float"); - // accept old precision key for backward compatibility - if (config.Exists("type")) - type = config("type", "float"); - if ( myRank == 0 ) - fprintf(stderr, "\nprecision = %s\n", type.c_str()); - if (type == "float") - DoCommand(config); - else if (type == "double") - DoCommand(config); - else - RuntimeError("invalid precision specified: %s", type.c_str()); - } - catch(const std::exception &err) - { - fprintf(stderr, "EXCEPTION occurred: %s", err.what()); -#ifdef _DEBUG - DebugBreak(); -#endif - return EXIT_FAILURE; - } - catch(...) - { - fprintf(stderr, "Unknown ERROR occurred"); -#ifdef _DEBUG - DebugBreak(); -#endif - return EXIT_FAILURE; - } -#ifdef MPI_SUPPORT - MPI_Finalize(); -#endif - return EXIT_SUCCESS; -} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// cn.cpp : Defines the entry point for the console application. 
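// The include list below adds the generated buildinfo.h consumed by
// PrintBuiltInfo(). A sketch of what prebuild.bat (added later in this change)
// is expected to emit; all concrete values here are placeholders:
//
//     #ifndef _BUILDINFO_H
//     #define _BUILDINFO_H
//     #define _GIT_EXIST
//     #define _BUILDBRANCH_ "master"
//     #define _BUILDSHA1_ "0123456789abcdef0123456789abcdef01234567"
//     #define _BUILDER_ "someuser"
//     #define _BUILDMACHINE_ "SOMEHOST"
//     #define _BUILDPATH_ "c:\\work\\cntk\\MachineLearning\\cn\\"
//     #endif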
+// + +#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ + +#include "stdafx.h" +#include "ComputationNetwork.h" +#include "ComputationNode.h" +#include "DataReader.h" +#include "DataWriter.h" +#include "SimpleNetworkBuilder.h" +#include "NDLNetworkBuilder.h" +#include "SynchronousExecutionEngine.h" +#include "ModelEditLanguage.h" +#include "SGD.h" +#include +#include "commandArgUtil.h" +#include "SimpleEvaluator.h" +#include "SimpleOutputWriter.h" +#include +#include +#if defined(_WIN32) +#include "io.h" +#endif +#include "hostname.h" +#include "buildinfo.h" +#ifdef LEAKDETECT +#include "vld.h" // for memory leak detection +#endif +#include +#include "BestGpu.h" + +// MPI builds on windows require the following installed to "c:\program files\Microsoft MPI\" +// HPC Pack 2012 R2 MS-MPI Redistributable Package +// http://www.microsoft.com/en-us/download/details.aspx?id=41634 + +#ifdef MPI_SUPPORT +#include "mpi.h" +#pragma comment(lib, "msmpi.lib") +#endif +int numProcs; +int myRank; + +using namespace std; +using namespace Microsoft::MSR::CNTK; + +// internal test routine forward declaration +template +void TestCn(const ConfigParameters& config); + +void RedirectStdErr(wstring logpath) +{ + fprintf (stderr, "Redirecting stderr to file %S\n", logpath.c_str()); + msra::files::make_intermediate_dirs (logpath); + auto_file_ptr f (logpath.c_str(), "wb"); + if (dup2 (fileno (f), 2) == -1) + RuntimeError ("unexpected failure to redirect stderr to log file"); + setvbuf (stderr, NULL, _IONBF, 16384); // unbuffer it +} + +std::string WCharToString(const wchar_t* wst) +{ + std::wstring ws(wst); + std::string s(ws.begin(), ws.end()); + s.assign(ws.begin(), ws.end()); + return s; +} + +template +void DumpNodeInfo(const ConfigParameters& config) +{ + wstring modelPath = config("modelPath"); + wstring nodeName = config("nodeName",L"__AllNodes__"); + wstring defOutFilePath = modelPath + L"." 
+ nodeName + L".txt"; + wstring outputFile = config("outputFile", WCharToString(defOutFilePath.c_str()).c_str()); + bool printValues = config("printValues", "true"); + + ComputationNetwork net(-1); //always use CPU + net.LoadFromFile(modelPath); + net.DumpNodeInfoToFile(nodeName, printValues, outputFile); +} + +template +void DoEvalBase(const ConfigParameters& config, IDataReader& reader) +{ + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "40960"); + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + + int traceLevel = config("traceLevel", "0"); + size_t numMBsToShowResult = config("numMBsToShowResult", "100"); + + ConfigArray evalNodeNames = config("evalNodeNames",""); + vector evalNodeNamesVector; + for (int i=0; i < evalNodeNames.size(); ++i) + { + evalNodeNamesVector.push_back(evalNodeNames[i]); + } + + ComputationNetwork net(deviceId); + net.LoadFromFile(modelPath); + net.ResetEvalTimeStamp(); + + SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); + eval.Evaluate(reader, evalNodeNamesVector, mbSize[0], epochSize); +} + +template +void DoEval(const ConfigParameters& config) +{ + //test + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DataReader testDataReader(readerConfig); + + DoEvalBase(config, testDataReader); +} + +template +void DoEvalUnroll(const ConfigParameters& config) +{ + //test + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DataReader testDataReader(readerConfig); + + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "40960"); + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + wstring path2EvalResults = config("path2EvalResults", L""); + + ComputationNetwork net(deviceId); + net.LoadFromFile(modelPath); + net.ResetEvalTimeStamp(); + + SimpleEvaluator eval(net); + ElemType evalEntropy; + eval.EvaluateUnroll(testDataReader, mbSize[0], evalEntropy, path2EvalResults == L""? 
nullptr : path2EvalResults.c_str(), epochSize); +} + +template +void DoCrossValidate(const ConfigParameters& config) +{ + //test + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "40960"); + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + + ConfigArray cvIntervalConfig = config("crossValidationInterval"); + intargvector cvInterval = cvIntervalConfig; + + size_t sleepSecondsBetweenRuns = config("sleepTimeBetweenRuns", "0"); + + int traceLevel = config("traceLevel", "0"); + size_t numMBsToShowResult = config("numMBsToShowResult", "100"); + + ConfigArray evalNodeNames = config("evalNodeNames",""); + vector evalNodeNamesVector; + for (int i=0; i < evalNodeNames.size(); ++i) + { + evalNodeNamesVector.push_back(evalNodeNames[i]); + } + + std::vector> cvErrorResults; + std::vector cvModels; + + DataReader cvDataReader(readerConfig); + + bool finalModelEvaluated = false; + for (size_t i=cvInterval[0]; i<=cvInterval[2]; i+=cvInterval[1]) + { + wstring cvModelPath = msra::strfun::wstrprintf (L"%ls.%lld", modelPath.c_str(), i); + + if (!fexists (cvModelPath)) + { + fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str()); + if (finalModelEvaluated || !fexists (modelPath)) + continue; // file missing + else + { + cvModelPath = modelPath; + finalModelEvaluated = true; + } + } + + cvModels.push_back(cvModelPath); + ComputationNetwork net(deviceId); + net.LoadFromFile(cvModelPath); + net.ResetEvalTimeStamp(); + + SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); + + fprintf(stderr, "model %ls --> \n",cvModelPath.c_str()); + std::vector evalErrors; + evalErrors = eval.Evaluate(cvDataReader, evalNodeNamesVector, mbSize[0], epochSize); + cvErrorResults.push_back(evalErrors); + + ::Sleep(1000*sleepSecondsBetweenRuns); + } + + //find best model + if (cvErrorResults.size() == 0) + throw std::logic_error("No model is evaluated."); + + std::vector minErrors; + std::vector minErrIds; + std::vector evalErrors = cvErrorResults[0]; + for (int i=0; i < evalErrors.size(); ++i) + { + minErrors.push_back(evalErrors[i]); + minErrIds.push_back(0); + } + + for (int i=0; i +void DoWriteOutput(const ConfigParameters& config) +{ + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + readerConfig.Insert("randomize","None"); //we don't want randomization when output results + + DataReader testDataReader(readerConfig); + + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + ConfigArray minibatchSize = config("minibatchSize", "2048"); + wstring modelPath = config("modelPath"); + intargvector mbSize = minibatchSize; + + size_t epochSize = config("epochSize", "0"); + if (epochSize == 0) + { + epochSize = requestDataSize; + } + + ConfigArray outputNodeNames = config("outputNodeNames",""); + vector outputNodeNamesVector; + for (int i=0; i < outputNodeNames.size(); ++i) + { + outputNodeNamesVector.push_back(outputNodeNames[i]); + } + + ComputationNetwork net(deviceId); + net.LoadFromFile(modelPath); + net.ResetEvalTimeStamp(); + + SimpleOutputWriter writer(net, 1); + + if (config.Exists("writer")) + { + ConfigParameters writerConfig (config("writer")); + bool bWriterUnittest = writerConfig("unittest","false"); + DataWriter 
testDataWriter(writerConfig); + writer.WriteOutput(testDataReader,mbSize[0], testDataWriter, outputNodeNamesVector, epochSize, bWriterUnittest); + } + else if (config.Exists("outputPath")) + { + wstring outputPath = config("outputPath"); // crashes if no default given? + writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, epochSize); + } + //writer.WriteOutput(testDataReader, mbSize[0], testDataWriter, outputNodeNamesVector, epochSize); +} + +namespace Microsoft { namespace MSR { namespace CNTK { + +TrainingCriterion ParseTrainingCriterionString(wstring s) +{ + msra::strfun::tolower_ascii(s); + if (s==L"crossentropywithsoftmax") + return TrainingCriterion::CrossEntropyWithSoftmax; + else if (s==L"squareerror") + return TrainingCriterion::SquareError; + else if (s!=L"classcrossentropywithsoftmax") // (twisted logic to keep compiler happy w.r.t. not returning from LogicError) + LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (CrossEntropyWithSoftmax | SquareError | ClassCrossEntropyWithSoftmax)"); + return TrainingCriterion::ClassCrossEntropyWithSoftmax; +} + +EvalCriterion ParseEvalCriterionString(wstring s) +{ + msra::strfun::tolower_ascii(s); + if (s==L"errorprediction") + return EvalCriterion::ErrorPrediction; + else if (s==L"crossentropywithsoftmax") + return EvalCriterion::CrossEntropyWithSoftmax; + else if (s==L"classcrossentropywithsoftmax") + return EvalCriterion::ClassCrossEntropyWithSoftmax; + else if (s!=L"squareerror") + LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (ErrorPrediction | CrossEntropyWithSoftmax | SquareError)"); + return EvalCriterion::SquareError; +} + +}}}; + +template +void DoCreateLabelMap(const ConfigParameters& config) +{ + // this gets the section name we are interested in + std::string section = config("section"); + // get that section (probably a peer config section, which works thanks to heirarchal symbol resolution) + ConfigParameters configSection (config(section)); + ConfigParameters readerConfig (configSection("reader")); + readerConfig.Insert("allowMapCreation","true"); + DEVICEID_TYPE deviceId = CPUDEVICE; + size_t minibatchSize = config("minibatchSize", "2048"); + int traceLevel = config("traceLevel","0"); + std::vector featureNames; + std::vector labelNames; + GetFileConfigNames(readerConfig, featureNames, labelNames); + + // setup minibatch matrices + Matrix featuresMatrix(deviceId); + Matrix labelsMatrix(deviceId); + std::map*> matrices; + matrices[featureNames[0]] = &featuresMatrix; + if (labelNames.size() == 0) + RuntimeError("CreateLabelMap: no labels found to process"); + + // now create the reader and loop through the entire dataset to get all the labels + auto start = std::chrono::system_clock::now(); + for (const std::wstring& labelsName: labelNames) + { + // take the last label file defined (the other one might be input) + matrices[labelsName] = &labelsMatrix; + + // get the label mapping file name + ConfigParameters labelConfig (readerConfig(labelsName)); + std::string labelMappingFile; + if (labelConfig.ExistsCurrent("labelMappingFile")) + labelMappingFile = labelConfig("labelMappingFile"); + else if (readerConfig.ExistsCurrent("labelMappingFile")) + labelMappingFile = labelConfig("labelMappingFile"); + else + RuntimeError("CreateLabelMap: No labelMappingFile defined"); + + if (fexists(labelMappingFile)) + { + fprintf(stderr,"CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str()); + return; + } 
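// When the mapping file does not exist yet, the code below appears to work as
// follows (an inference from the calls, not a statement from the original
// author): the reader was created with allowMapCreation=true, one full pass
// over the data lets it observe every label value, and the second
// StartMinibatchLoop(..., 1, ...) call advances the reader past epoch 0 so the
// completed mapping can be persisted to labelMappingFile.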
+ fprintf(stderr,"CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str()); + + DataReader dataReader(readerConfig); + + dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize); + int count = 0; + while (dataReader.GetMinibatch(matrices)) + { + Matrix& features = *matrices[featureNames[0]]; + count += features.GetNumCols(); + if (traceLevel > 1) + fprintf(stderr,"."); // progress meter + } + dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize); + + // print the results + if (traceLevel > 0) + fprintf(stderr,"\nread %d labels and produced %s\n", count, labelMappingFile.c_str()); + } + auto end = std::chrono::system_clock::now(); + auto elapsed = end-start; + if (traceLevel > 1) + fprintf(stderr, "%f seconds elapsed\n", (float)(std::chrono::duration_cast(elapsed).count())/1000); +} + + +template +void DoTrain(const ConfigParameters& config) +{ + ConfigParameters configSGD (config("SGD")); + bool makeMode = config("makeMode", "true"); + + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + IComputationNetBuilder* netBuilder = NULL; + + if (config.Exists("NDLNetworkBuilder")) + { + ConfigParameters configNDL (config("NDLNetworkBuilder")); + netBuilder = (IComputationNetBuilder*)new NDLBuilder(configNDL); + } + else if (config.Exists("SimpleNetworkBuilder")) + { + ConfigParameters configSNB (config("SimpleNetworkBuilder")); + netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(configSNB); + } + else + { + RuntimeError("No network builder found in the config file. NDLNetworkBuilder or SimpleNetworkBuilde must be specified" ); + } + + DataReader* dataReader = new DataReader(readerConfig); + + DataReader* cvDataReader = nullptr; + ConfigParameters cvReaderConfig (config("cvReader", L"")); + + if (cvReaderConfig.size() != 0) + { + cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); + cvDataReader = new DataReader(cvReaderConfig); + } + + SGD sgd(configSGD); + + sgd.Train(netBuilder, dataReader, cvDataReader, makeMode); + + delete netBuilder; + delete dataReader; + delete cvDataReader; +} + +template +void DoAdapt(const ConfigParameters& config) +{ + DEVICEID_TYPE deviceId = DeviceFromConfig(config); + + ConfigParameters configSGD (config("SGD")); + bool makeMode = config("makeMode", "true"); + + ConfigParameters readerConfig (config("reader")); + readerConfig.Insert("traceLevel",config("traceLevel","0")); + + DataReader* dataReader = new DataReader(readerConfig); + + DataReader* cvDataReader = nullptr; + ConfigParameters cvReaderConfig (config("cvReader", L"")); + + if (cvReaderConfig.size() != 0) + { + cvReaderConfig.Insert("traceLevel",config("traceLevel","0")); + cvDataReader = new DataReader(cvReaderConfig); + } + + wstring origModelFileName = config("origModelFileName", L""); + wstring refNodeName = config("refNodeName", L""); + + SGD sgd(configSGD); + + sgd.Adapt(origModelFileName, refNodeName, dataReader, cvDataReader, deviceId, makeMode); + + delete dataReader; + delete cvDataReader; +} + +template +void DoEdit(const ConfigParameters& config) +{ + wstring editPath = config("editPath"); + wstring ndlMacros = config("ndlMacros",""); + NDLScript ndlScript; + if (!ndlMacros.empty()) + ndlScript.LoadConfigFile(ndlMacros); + MELScript melScript; + melScript.LoadConfigFileAndResolveVariables(editPath, config); +} + +template +void DoConvertFromDbn(const ConfigParameters& config) +{ + //config.Insert("deviceId","-1"); //force using CPU + + wstring modelPath = 
config("modelPath"); + wstring dbnModelPath = config("dbnModelPath"); + + IComputationNetBuilder* netBuilder = (IComputationNetBuilder*)new SimpleNetworkBuilder(config); + ComputationNetwork& net = netBuilder->LoadNetworkFromFile(dbnModelPath); + net.SaveToFile(modelPath); + delete (netBuilder); +} +// process the command +template +void DoCommand(const ConfigParameters& config) +{ + ConfigArray command = config("command", "train"); + for (int i=0; i < command.size(); i++) + { + //get the configuration parameters that match the command + ConfigParameters commandParams (config(command[i])); + ConfigArray action = commandParams("action","train"); + + // determine the action to perform, and do it + for (int j=0; j < action.size(); j++) + { + if (action[j] == "train" || action[j] == "trainRNN") + DoTrain(commandParams); + else if (action[j] == "adapt") + DoAdapt(commandParams); + else if (action[j] == "test" || action[j] == "eval") + DoEval(commandParams); + else if (action[j] == "testunroll") + DoEvalUnroll(commandParams); + else if (action[j] == "edit") + DoEdit(commandParams); + else if (action[j] == "cv") + DoCrossValidate(commandParams); + else if (action[j] == "write") + DoWriteOutput(commandParams); + else if (action[j] == "devtest") + TestCn(config); // for "devtest" action pass the root config instead + else if (action[j] == "dumpnode") + DumpNodeInfo(commandParams); + else if (action[j] == "convertdbn") + DoConvertFromDbn(commandParams); + else if (action[j] == "createLabelMap") + DoCreateLabelMap(commandParams); + else + RuntimeError("unknown action: %s in command set: %s", action[j].c_str(), command[i].c_str()); + + NDLScript ndlScript; + ndlScript.ClearGlobal(); // clear global macros between commands + } + } +} + +std::string TimeDateStamp() +{ +#if 0 // "safe" version for Windows, not needed it seems + __time64_t localtime; + + _time64 (&localtime);// get current time and date + struct tm now; + _localtime64_s (&now, &localtime); // convert +#else + time_t t = time(NULL); + struct tm now = *localtime(&t); +#endif + char buf[30]; + sprintf (buf, "%04d/%02d/%02d %02d:%02d:%02d", now.tm_year + 1900, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec); + return buf; +} + +#ifdef MPI_SUPPORT +// Oh, my gosh, this is going to be ugly. MPI_INIT needs a char* argv[], so let's interface. 
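// What the wrapper below does, in brief: it converts the wide argv to UTF-8
// copies for the real (char-based) MPI_Init, remembers each UTF-8 string's
// original wide pointer in recover_wstring, and after MPI_Init returns,
// possibly having removed or reordered MPI's own arguments, rebuilds the wide
// argv from the surviving strings, failing loudly if MPI hands back a string
// that was never produced from the original command line.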
+int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv) +{ + // this maps from the strings + std::map recover_wstring; + + // do the mapping to 8-bit encoding for MPI_Init() + vector> argv_string_vector; + transform(*argv, *argv + *argc, std::back_inserter(argv_string_vector), + [&recover_wstring](wchar_t*pws)->vector + { + std::string tmp = msra::strfun::utf8(std::wstring(pws)); + recover_wstring[tmp] = pws; + vector rv(tmp.begin(), tmp.end()); + rv.push_back('\0'); + return rv; + } + ); + vector argv_charptr_vector; + transform(argv_string_vector.begin(), argv_string_vector.end(), std::back_inserter(argv_charptr_vector), + [](std::vector&cs)->char*{ return &(cs[0]); } + ); + char** argv_char = &(argv_charptr_vector[0]); + + // Do the initialization + int rv = MPI_Init(argc, &argv_char); + + // try and reconstruct how MPI_Init changed the argv + transform(argv_char, argv_char + *argc, stdext::checked_array_iterator(*argv, *argc), + [&recover_wstring](char*pc)->wchar_t* + { + auto it = recover_wstring.find(std::string(pc)); + if (it == recover_wstring.end()) + RuntimeError("Unexpected interaction between MPI_Init and command line parameters"); + return it->second; + } + ); + + // pass through return value from internal call to MPI_Init() + return rv; +} +#endif + +void PrintBuiltInfo() +{ + fprintf(stderr, "-------------------------------------------------------------------\n"); + fprintf(stderr, "Build info: \n\n"); + fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__); + fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__); + fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_); + fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_); +#ifdef _GIT_EXIST + fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); + fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_); +#endif + fprintf(stderr, "-------------------------------------------------------------------\n"); + +} + + +int wmain(int argc, wchar_t* argv[]) +{ + + try + { +#ifdef MPI_SUPPORT + { + int rc; + rc = MPI_Init(&argc, &argv); + if (rc != MPI_SUCCESS) + { + MPI_Abort(MPI_COMM_WORLD, rc); + RuntimeError("Failure in MPI_Init: %d", rc); + } + MPI_Comm_size(MPI_COMM_WORLD, &numProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + fprintf(stderr, "MPI: RUNNING ON (%s), process %d/%d\n", getenv("COMPUTERNAME"), myRank, numProcs); + fflush(stderr); + } +#else + numProcs = 1; + myRank = 0; +#endif + + ConfigParameters config; + std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); + + // get the command param set they want + wstring logpath = config("stderr", L""); + // [1/26/2015 erw, add done file so that it can be used on HPC] + wstring DoneFile = config("DoneFile", L""); + ConfigArray command = config("command", "train"); + + if (logpath != L"") + { + for (int i=0; i < command.size(); i++) + { + logpath += L"_"; + logpath += (wstring)command[i]; + } + logpath += L".log"; + if (numProcs > 1) + { + std::wostringstream oss; + oss << myRank; + logpath += L"rank" + oss.str(); + } + + RedirectStdErr(logpath); + } + + + PrintBuiltInfo(); + + + std::string timestamp = TimeDateStamp(); + + if (myRank == 0) // main process + { + //dump config info + fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); + fprintf(stderr, "command line options: \n"); + for (int i = 1; i < argc; i++) + fprintf(stderr, "%s ", WCharToString(argv[i]).c_str()); + + // This simply merges all the different config parameters specified (eg, via 
config files or via command line directly), + // and prints it. + fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n"); + fprintf(stderr, "%s\n", rawConfigString.c_str()); + fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n"); + + // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line), + // All of these assignments will appear, even though only the last assignment matters. + fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); + fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str()); + fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); + + // This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last + // value it is set to will appear). + fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); + config.dumpWithResolvedVariables(); + fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); + + fprintf(stderr, "command: "); + for (int i = 0; i < command.size(); i++) + { + fprintf(stderr, "%s ", command[i].c_str()); + } + } + + //run commands + std::string type = config("precision", "float"); + // accept old precision key for backward compatibility + if (config.Exists("type")) + type = config("type", "float"); + if ( myRank == 0 ) + fprintf(stderr, "\nprecision = %s\n", type.c_str()); + if (type == "float") + DoCommand(config); + else if (type == "double") + DoCommand(config); + else + RuntimeError("invalid precision specified: %s", type.c_str()); + + // still here , write a DoneFile if necessary + if (!DoneFile.empty()){ + FILE* fp = fopenOrDie(DoneFile.c_str(), L"w"); + fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(),GetHostName().c_str()); + fcloseOrDie(fp); + } + } + catch (const std::exception &err) + { + fprintf(stderr, "EXCEPTION occurred: %s", err.what()); +#ifdef _DEBUG + DebugBreak(); +#endif + return EXIT_FAILURE; + } + catch(...) + { + fprintf(stderr, "Unknown ERROR occurred"); +#ifdef _DEBUG + DebugBreak(); +#endif + return EXIT_FAILURE; + } +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + return EXIT_SUCCESS; +} diff --git a/MachineLearning/cn/cn.vcxproj b/MachineLearning/cn/cn.vcxproj index 4f301d7aa..090f87a0b 100644 --- a/MachineLearning/cn/cn.vcxproj +++ b/MachineLearning/cn/cn.vcxproj @@ -139,6 +139,9 @@ true Copy content files to target directory + + prebuild.bat + @@ -199,6 +202,9 @@ + + prebuild.bat + @@ -216,6 +222,7 @@ + @@ -249,6 +256,7 @@ NotUsing + diff --git a/MachineLearning/cn/cn.vcxproj.filters b/MachineLearning/cn/cn.vcxproj.filters index 65605bc86..0d071e72a 100644 --- a/MachineLearning/cn/cn.vcxproj.filters +++ b/MachineLearning/cn/cn.vcxproj.filters @@ -43,6 +43,9 @@ Network + + Common + @@ -138,6 +141,9 @@ Common\Include + + Common\Include + diff --git a/MachineLearning/cn/prebuild.bat b/MachineLearning/cn/prebuild.bat new file mode 100644 index 000000000..8759732a9 --- /dev/null +++ b/MachineLearning/cn/prebuild.bat @@ -0,0 +1,30 @@ +@echo off + + +echo #ifndef _BUILDINFO_H > buildinfo.h +echo #define _BUILDINFO_H >> buildinfo.h + + +FOR /F "usebackq" %%i IN (`hostname`) DO SET HOST=%%i +:: assuming hostname always exists + +:: not sure whether git in path ? 
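:: (the cn.vcxproj change above appears to run this script as a pre-build
:: step, so buildinfo.h is regenerated before every build of cn.exe)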
+:: probe by running git: if git is not on the PATH, cmd.exe sets ERRORLEVEL
+:: to 9009 ("not recognized as an internal or external command"), so any
+:: other ERRORLEVEL means git ran and can be queried for branch and commit
+git --version >nul 2>&1
+if not %ERRORLEVEL% == 9009 (
+    echo #define _GIT_EXIST >> buildinfo.h
+    FOR /F "usebackq" %%i IN (`git rev-parse --abbrev-ref HEAD`) DO SET BRANCH=%%i
+    FOR /F "usebackq" %%i IN (`git rev-parse HEAD`) DO SET COMMIT=%%i
+    echo #define _BUILDBRANCH_ "%BRANCH%" >> buildinfo.h
+    echo #define _BUILDSHA1_ "%COMMIT%" >> buildinfo.h
+)
+
+echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h
+echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h
+
+set a=%~dp0
+set buildpath="%a:\=\\%"
+echo #define _BUILDPATH_ %buildpath% >> buildinfo.h
+
+echo #endif >> buildinfo.h
diff --git a/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp b/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp
index 65837721b..ecd4e2f48 100644
--- a/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp
+++ b/Math/CNTKMathTest/MatrixSparseDenseInteractionsTests.cpp
@@ -1,217 +1,264 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-#include "stdafx.h"
-#include "CppUnitTest.h"
-#include "..\Math\Matrix.h"
-
-#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project
-
-#define epsilon 0.000001
-#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
-
-using namespace Microsoft::MSR::CNTK;
-using namespace Microsoft::VisualStudio::CppUnitTestFramework;
-
-
-namespace CNTKMathTest
-{
-    TEST_CLASS(MatrixUnitTest)
-    {
-
-    public:
-
-        //This test should fail if you don't have CUDA GPU (or working under remote desktop)
-        TEST_METHOD(MatrixChangeModesBetweenDenseAndSparseTests_Simple)
-        {
-            Matrix A;
-            A.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0);
-            long n0 = A.MatrixNorm0();
-            Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
-            A.SwitchToMatrixType(MatrixType::SPARSE);
-            Assert::IsTrue(MatrixType::SPARSE==A.GetMatrixType());
-            long n1 = A.MatrixNorm0();
-            Assert::AreEqual(n0,n1);
-            A.SwitchToMatrixType(MatrixType::DENSE);
-            Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType());
-        }
-
-        TEST_METHOD(MatrixSparseTimesDense)
-        {
-            Matrix Ad; //DENSE
-            Ad.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0);//DENSE
-            Matrix As(Ad);//DENSE
-            As.SwitchToMatrixType(MatrixType::SPARSE); //!!!
MATRIX As becomes sparse - Matrix B = Matrix::RandomGaussian(2048,128,1,4); //DENSE - Matrix C = Matrix::RandomGaussian(4096,128,1,2); //DENSE - Matrix C1(C); //DENSE - - float alpha = 0.3, beta = 2; - bool transposeA=false, transposeB=false; - Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,B,transposeB,beta,C); // DENSE*DENSE - Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,B,transposeB,beta,C1);// SPARSE*DENSE - Assert::IsTrue(C1.IsEqualTo(C,0.00001)); - } - - TEST_METHOD(MatrixDenseTimesSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); - - Matrix B = Matrix::RandomGaussian(2048,1024,1,4); - Matrix C = Matrix::RandomGaussian(2048,2048,1,2); - Matrix C1(C); - - float alpha = 0.3, beta = 0; - bool transposeA=false, transposeB=false; - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); - Assert::IsTrue(C1.IsEqualTo(C,0.0001)); - - alpha = 3.3, beta = 1.3; - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); - Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); - Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision - } - - TEST_METHOD(MatrixSparseTimesSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(2048,1024,-5,0.4,0),0); - Matrix Bs(Bd); - - Matrix Cd; - Cd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,1024,-4,0.2,0),0); - Matrix Cs(Cd); - - float alpha = 2.4, beta=0; - bool transposeA = false, transposeB=false; - Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - Cs.SwitchToMatrixType(MatrixType::SPARSE); - - Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); - Cs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); - - - alpha = 2.4, beta=3.4; - Cs.SwitchToMatrixType(MatrixType::SPARSE); - Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - Cs.SwitchToMatrixType(MatrixType::SPARSE); - - Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); - Cs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); - } - - TEST_METHOD(MatrixSparsePlusSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bs(Bd); - - float alpha = 1.0*rand() / RAND_MAX; - Matrix::ScaleAndAdd(alpha,Ad,Bd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - Matrix::ScaleAndAdd(alpha,As,Bs); - - Bs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); - } - - TEST_METHOD(MatrixDensePlusSparse) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bs(Bd); - - float alpha = 1.0*rand() / RAND_MAX; - Matrix::ScaleAndAdd(alpha,Ad,Bd); - - Bs.SwitchToMatrixType(MatrixType::SPARSE); - 
Matrix::ScaleAndAdd(alpha,Ad,Bs); - - Bs.SwitchToMatrixType(MatrixType::DENSE); - Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); - } - - TEST_METHOD(MatrixSparsePlusDense) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bd1(Bd); - - float alpha = 1.0*rand() / RAND_MAX; - Matrix::ScaleAndAdd(alpha,Ad,Bd); - - As.SwitchToMatrixType(MatrixType::SPARSE); - Matrix::ScaleAndAdd(alpha,As,Bd1); - - Assert::IsTrue(Bd1.IsEqualTo(Bd,0.00001)); - } - - TEST_METHOD(MatrixSparseElementWisePower) - { - Matrix Ad; - Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); - Matrix As(Ad); - As.SwitchToMatrixType(MatrixType::SPARSE); - - Matrix Bd; - Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); - Matrix Bs(Bd); - Bs.SwitchToMatrixType(MatrixType::SPARSE); - - Ad^=2.3; //DENSE - As^=2.3; //SPARSE - Assert::IsTrue(As.IsEqualTo(Ad,0.00001)); - Assert::IsTrue(Ad.IsEqualTo(As,0.00001)); - - Bd.AssignElementPowerOf(Ad,3.2); - Bs.AssignElementPowerOf(As,3.2); -#ifdef CHECK - Bs.SwitchToMatrixType(DENSE); - Bd.TransferFromDeviceToDevice(0,CPUDEVICE); - Bs.TransferFromDeviceToDevice(0,CPUDEVICE); - for (int r = 0; r < Bd.GetNumRows(); ++r) - for (int c = 0; c < Bd.GetNumCols(); ++c) - { - float dVal = Bd(r,c); - float sVal = Bs(r,c); - float diff = sVal - dVal; - if (fabs(diff) > 0.001) - cout << "[" << r << ", " << c << "]: " << sVal << " and " << dVal; - } -#endif - Assert::IsTrue(Bs.IsEqualTo(Bd,0.0001)); - Assert::IsTrue(Bd.IsEqualTo(Bs,0.0001)); - } - }; +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#include "stdafx.h" +#include "CppUnitTest.h" +#include "..\Math\Matrix.h" + +#pragma warning (disable: 4244 4245 4305) // conversions and truncations; we don't care in this test project + +#define epsilon 0.000001 +#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing + +using namespace Microsoft::MSR::CNTK; +using namespace Microsoft::VisualStudio::CppUnitTestFramework; + + +namespace CNTKMathTest +{ + TEST_CLASS(MatrixUnitTest) + { + + public: + + //This test should fail if you don't have CUDA GPU (or working under remote desktop) + TEST_METHOD(MatrixChangeModesBetweenDenseAndSparseTests_Simple) + { + Matrix A; + A.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0); + long n0 = A.MatrixNorm0(); + Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType()); + A.SwitchToMatrixType(MatrixType::SPARSE); + Assert::IsTrue(MatrixType::SPARSE==A.GetMatrixType()); + long n1 = A.MatrixNorm0(); + Assert::AreEqual(n0,n1); + A.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(MatrixType::DENSE==A.GetMatrixType()); + } + + TEST_METHOD(MatrixSparseTimesDense) + { + Matrix Ad; //DENSE + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(4096,2048,-3,0.1,0),0);//DENSE + Matrix As(Ad);//DENSE + As.SwitchToMatrixType(MatrixType::SPARSE); //!!! 
MATRIX As becomes sparse + Matrix B = Matrix::RandomGaussian(2048,128,1,4); //DENSE + Matrix C = Matrix::RandomGaussian(4096,128,1,2); //DENSE + Matrix C1(C); //DENSE + + float alpha = 0.3, beta = 2; + bool transposeA=false, transposeB=false; + Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,B,transposeB,beta,C); // DENSE*DENSE + Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,B,transposeB,beta,C1);// SPARSE*DENSE + Assert::IsTrue(C1.IsEqualTo(C,0.00001)); + } + + TEST_METHOD(MatrixDenseTimesSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); + + Matrix B = Matrix::RandomGaussian(2048,1024,1,4); + Matrix C = Matrix::RandomGaussian(2048,2048,1,2); + Matrix C1(C); + + float alpha = 0.3, beta = 0; + bool transposeA=false, transposeB=false; + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); + Assert::IsTrue(C1.IsEqualTo(C,0.0001)); + + alpha = 3.3, beta = 1.3; + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,Ad,transposeB,beta,C); + Matrix::MultiplyAndWeightedAdd(alpha,B,transposeA,As,transposeB,beta,C1); + Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision + } + + TEST_METHOD(CPUMatrixDenseTimesSparse) + { + Matrix Ad(CPUDEVICE); + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024, 2048, -3, 0.1, 0), 0); + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); + + Matrix B = Matrix::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix C = Matrix::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix C1(C); + + float alpha = 0.3, beta = 0; + bool transposeA = false, transposeB = false; + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C); + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1); + Assert::IsTrue(C1.IsEqualTo(C, 0.0001)); + + alpha = 3.3, beta = 1.3; + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C); + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1); + + // TODO IsEqualTo NYI + // Assert::IsTrue(C1.IsEqualTo(C, 0.00005)); + } + + TEST_METHOD(CPUMatrixDenseTimesSparseAsSparse) + { + Matrix Ad(CPUDEVICE); + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(2048, 1024, -3, 0.1, 0), 0); + + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC); + + Matrix B = Matrix::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix AsCsc = Matrix::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE); + Matrix AsBlock(CPUDEVICE); + AsBlock.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol); + + float alpha = 0.3, beta = 0; + bool transposeA = false, transposeB = true; + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, AsBlock); + Matrix::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, AsCsc); + + // TODO IsEqualTo NYI + // Assert::IsTrue(AsBlock.IsEqualTo(AsCsc, 0.0001)); + } + + TEST_METHOD(MatrixSparseTimesSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(2048,1024,-5,0.4,0),0); + Matrix Bs(Bd); + + Matrix Cd; + Cd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,1024,-4,0.2,0),0); + Matrix Cs(Cd); + + float 
alpha = 2.4, beta=0; + bool transposeA = false, transposeB=false; + Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Cs.SwitchToMatrixType(MatrixType::SPARSE); + + Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); + Cs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); + + + alpha = 2.4, beta=3.4; + Cs.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::MultiplyAndWeightedAdd(alpha,Ad,transposeA,Bd,transposeB,beta,Cd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Cs.SwitchToMatrixType(MatrixType::SPARSE); + + Matrix::MultiplyAndWeightedAdd(alpha,As,transposeA,Bs,transposeB,beta,Cs); + Cs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Cs.IsEqualTo(Cd,0.00001)); + } + + TEST_METHOD(MatrixSparsePlusSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bs(Bd); + + float alpha = 1.0*rand() / RAND_MAX; + Matrix::ScaleAndAdd(alpha,Ad,Bd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::ScaleAndAdd(alpha,As,Bs); + + Bs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); + } + + TEST_METHOD(MatrixDensePlusSparse) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bs(Bd); + + float alpha = 1.0*rand() / RAND_MAX; + Matrix::ScaleAndAdd(alpha,Ad,Bd); + + Bs.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::ScaleAndAdd(alpha,Ad,Bs); + + Bs.SwitchToMatrixType(MatrixType::DENSE); + Assert::IsTrue(Bs.IsEqualTo(Bd,0.00001)); + } + + TEST_METHOD(MatrixSparsePlusDense) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bd1(Bd); + + float alpha = 1.0*rand() / RAND_MAX; + Matrix::ScaleAndAdd(alpha,Ad,Bd); + + As.SwitchToMatrixType(MatrixType::SPARSE); + Matrix::ScaleAndAdd(alpha,As,Bd1); + + Assert::IsTrue(Bd1.IsEqualTo(Bd,0.00001)); + } + + TEST_METHOD(MatrixSparseElementWisePower) + { + Matrix Ad; + Ad.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-3,0.1,0),0); + Matrix As(Ad); + As.SwitchToMatrixType(MatrixType::SPARSE); + + Matrix Bd; + Bd.AssignTruncateBottomOf(Matrix::RandomUniform(1024,2048,-5,0.4,0),0); + Matrix Bs(Bd); + Bs.SwitchToMatrixType(MatrixType::SPARSE); + + Ad^=2.3; //DENSE + As^=2.3; //SPARSE + Assert::IsTrue(As.IsEqualTo(Ad,0.00001)); + Assert::IsTrue(Ad.IsEqualTo(As,0.00001)); + + Bd.AssignElementPowerOf(Ad,3.2); + Bs.AssignElementPowerOf(As,3.2); +#ifdef CHECK + Bs.SwitchToMatrixType(DENSE); + Bd.TransferFromDeviceToDevice(0,CPUDEVICE); + Bs.TransferFromDeviceToDevice(0,CPUDEVICE); + for (int r = 0; r < Bd.GetNumRows(); ++r) + for (int c = 0; c < Bd.GetNumCols(); ++c) + { + float dVal = Bd(r,c); + float sVal = Bs(r,c); + float diff = sVal - dVal; + if (fabs(diff) > 0.001) + cout << "[" << r << ", " << c << "]: " << sVal << " and " << dVal; + } +#endif + Assert::IsTrue(Bs.IsEqualTo(Bd,0.0001)); + Assert::IsTrue(Bd.IsEqualTo(Bs,0.0001)); + } + }; } \ No newline at end of file diff --git a/Math/Math/CPUSparseMatrix.cpp 
b/Math/Math/CPUSparseMatrix.cpp index 0e2fbeca9..266e3981f 100644 --- a/Math/Math/CPUSparseMatrix.cpp +++ b/Math/Math/CPUSparseMatrix.cpp @@ -1,944 +1,962 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// Math.cpp : Defines the exported functions for the DLL application. -// - -#include "stdafx.h" -#include -#include -#include -#include -#include "CPUMatrix.h" -#include "CPUSparseMatrix.h" -#include -#include -#ifdef _WIN32 -#include -#endif -#ifdef LEAKDETECT -#include -#endif - -#include "basetypes.h" -#include "fileutil.h" - - -#ifndef USE_MKL -// use ACML as default. -// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above -// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/ -// Install the ifort64 variant (compiled with intel compiler) of the library -// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml -// to point to your folder for the include file and link library -#include // requires ACML 5.3.0 and above -#else -// requires MKL 10.0 and above -#endif - -// This is an example of an exported variable -//MATH_API int nMath=0; - -// This is an example of an exported function. -//MATH_API int fnMath(void) -//{ -// return 42; -//} - -#ifndef USE_MKL //MKL has one additional parameter for different matrix order -#define BLAS_COLMAJOR -#else -#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor, -#endif - -#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);} -#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing -namespace Microsoft { namespace MSR { namespace CNTK { - -#pragma region Helpful Enum Definitions - enum class MatrixOrder - { - RowMajor = 101, // row-major arrays - ColMajor = 102 // column-major arrays - }; - - enum class MatrixTranspose : char - { - NoTrans = 'N', // trans='N' - Trans = 'T', // trans='T' - ConjTrans = 'C' // trans='C' - }; - - enum class SymMatrixType : char - { - Up = 'U', // symmetric matrix is stored in the upper part - Low = 'L', // symmetric matrix is stored in thelower part - Full = 'F', //full populated - NotSymmetric = 'N' //not a symmetric matrix - }; - - enum class MatrixOpSide : char - { - Left = 'L', // left multiply - Right = 'R', // right multiply - }; -#pragma endregion Helpful Enum Definitions - -#pragma region Constructors and Destructor - - //should only be used by constructors. - template - void CPUSparseMatrix::ZeroInit() - { - m_numRows = 0; - m_numCols = 0; - m_elemSizeAllocated = 0; - m_compIndexSize = 0; - m_externalBuffer = false; - m_computeDevice = CPUDEVICE; - m_nz = 0; - m_matrixName = NULL; - - //if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - m_colIdx = -1; - m_pArray = NULL; - m_unCompIndex = NULL; - m_compIndex = NULL; - } - //else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - m_blockSize = 0; - m_blockVal = NULL; - m_blockIds = NULL; - } - } - - //should only be used by constructors. 
- template - void CPUSparseMatrix::CheckInit(const MatrixFormat format) - { - if (format != MatrixFormat::matrixFormatSparseCSC && format != MatrixFormat::matrixFormatSparseCSR && format != MatrixFormat::matrixFormatSparseBlockCol && format != MatrixFormat::matrixFormatSparseBlockRow) - { - throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format"); - } - m_format = format; - ZeroInit(); - } - - template - CPUSparseMatrix::CPUSparseMatrix(const MatrixFormat format) - { - CheckInit(format); - } - - template - CPUSparseMatrix::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size) - { - CheckInit(format); - Resize(numRows, numCols, size); - } - - template - CPUSparseMatrix::~CPUSparseMatrix() - { - if (m_matrixName!=NULL) - { - delete[] m_matrixName; - m_matrixName = nullptr; - } - if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - if(m_pArray != NULL) - delete[] m_pArray; - if(m_unCompIndex != NULL) - delete[] m_unCompIndex; - if(m_compIndex != NULL) - delete[] m_compIndex; - } - else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - if(m_blockVal != NULL) - delete[] m_blockVal; - if(m_blockIds != NULL) - delete[] m_blockIds; - } - } - - - -#pragma endregion Constructors and Destructor - -#pragma region Basic Operators - - //make sure call order in colume wise for CSC and row wise for CSR - template - void CPUSparseMatrix::SetValue(const size_t row, const size_t col, const ElemType v) - { - if(m_format != MatrixFormat::matrixFormatSparseCSC && m_format != MatrixFormat::matrixFormatSparseCSR) - { - throw std::logic_error("CPUSparseMatrix: unsupported SetValue() call."); - } - - if(m_elemSizeAllocated < m_nz +1) //automatic resize - { - Resize(m_numRows, m_numCols, m_nz + 100); //allocate 100 more elelemnts and keep existing values - } - - if(row < 0 || row >= m_numRows) - { - throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id"); - } - - if(col < 0 || col >= m_numCols) { - throw std::logic_error("CPUSparseMatrix: SetValue() invalid column id"); - } - - size_t r = (m_format == matrixFormatSparseCSC) ? row: col; - size_t c = (m_format == matrixFormatSparseCSC) ? col: row; - - m_pArray[m_nz] = v; - m_unCompIndex[m_nz] = (CPUSPARSE_INDEX_TYPE)r; - - //consistency check - if(c == m_colIdx && r <= m_unCompIndex[m_nz-1]) - { - throw std::logic_error("CPUSparseMatrix: SetValue is not called properly"); - } - - if (c != m_colIdx) - { - m_compIndex[c] = CPUSPARSE_INDEX_TYPE(m_nz); - m_colIdx = (int) c; - } - m_compIndex[c + 1] = CPUSPARSE_INDEX_TYPE(m_nz + 1); - m_nz++; - } - - template - ElemType* CPUSparseMatrix::BufferPointer() const - { - if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - return m_pArray; - } - else - { - return m_blockVal; - } - } - - template - void CPUSparseMatrix::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues) - { - size_t newCompIndexSize = (numCols > numRows ? 
numCols : numRows) + 1; - bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize); - - m_numRows = numRows; - m_numCols = numCols; - - if (reallocate) - { - if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - ElemType *pArray = new ElemType[numNZElemToReserve]; - CPUSPARSE_INDEX_TYPE *unCompIndex = new CPUSPARSE_INDEX_TYPE[numNZElemToReserve]; - CPUSPARSE_INDEX_TYPE *compIndex = new CPUSPARSE_INDEX_TYPE[newCompIndexSize]; - - if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) - throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); - - if (keepExistingValues && m_nz > 0) - { - assert(m_compIndexSize > 0 && m_nz < numNZElemToReserve); - memcpy(pArray, m_pArray, NzSize()); - memcpy(unCompIndex, m_unCompIndex, MajorIndexSize()); - memcpy(compIndex, m_compIndex, SecondaryIndexSize()); - } - - if (m_pArray != NULL) - delete[] m_pArray; - if (m_unCompIndex != NULL) - delete[] m_unCompIndex; - if (m_compIndex != NULL) - delete[] m_compIndex; - - m_pArray = pArray; - m_unCompIndex = unCompIndex; - m_compIndex = compIndex; - } - else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - ElemType *blockVal = new ElemType[numNZElemToReserve]; - size_t *blockIds = new size_t[newCompIndexSize]; - - if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) - throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); - - if (keepExistingValues && m_elemSizeAllocated > 0) - { - assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve); - memcpy(blockVal, m_blockVal, NzSize()); - memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize); - } - - if (m_blockVal != NULL) - delete[] m_blockVal; - if(m_blockIds != NULL) - delete[] m_blockIds; - - m_blockVal = blockVal; - m_blockIds = blockIds; - } - - m_elemSizeAllocated = numNZElemToReserve; - m_compIndexSize = newCompIndexSize; - } - } - - //Reset matrix so it can be reused - template - void CPUSparseMatrix::Reset() - { - m_nz = 0; - m_colIdx = -1; - m_blockSize = 0; - } - - //c = alpha*op(lhs) * op(rhs) + beta*c - template - void CPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, - const CPUSparseMatrix& rhs, const bool transposeB, ElemType beta, CPUMatrix& c) - - { - if (lhs.IsEmpty() || rhs.IsEmpty()) - throw std::logic_error("MultiplyAndWeightedAdd: one of the input matrix is empty."); - - int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); - int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); - int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); - int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); - - assert (m>0 && k>0 && l>0 && n>0); //converting from size_t to int may cause overflow - assert (k == l); - if (k != l) - { - throw std::invalid_argument("CPUSparseMatrix::MultiplyAndWeightedAdd: The inner dimensions of a and b must match."); - } - - if (c.GetNumRows() != m || c.GetNumCols() != n) - { - c.Resize(m,n); - } - - if (beta == 0) - { - memset(c.GetArray(), 0, sizeof(ElemType) * c.GetNumElements()); - } - else if (beta != 1) - { -#pragma omp parallel for - foreach_coord(i,j,c) - { - c(i,j) = beta * c(i,j); - } - } - - if (rhs.GetFormat() != matrixFormatSparseCSC) - NOT_IMPLEMENTED; - - if (!transposeA && !transposeB) - { - for(size_t j = 0; j < rhs.GetNumCols(); j++) - { - size_t start = rhs.m_compIndex[j]; //ColLocation - size_t end = rhs.m_compIndex[j+1]; - for(size_t p = start; p < end; p++) - { - size_t i = rhs.m_unCompIndex[p]; //RowLocation - ElemType val = rhs.m_pArray[p]; - - for(size_t h = 0; h < lhs.GetNumRows(); h++) - { - c(h,j) += alpha * lhs(h, i)*val; - } - } - } - } - else if (!transposeA && transposeB) - { - for(size_t j = 0; j < rhs.GetNumCols(); j++) - { - size_t start = rhs.m_compIndex[j]; - size_t end = rhs.m_compIndex[j + 1]; - - for(size_t p = start; p < end; p++) - { - size_t i = rhs.m_unCompIndex[p]; - ElemType val = rhs.m_pArray[p]; - for(size_t h = 0; h < lhs.GetNumRows(); h++) - { - c(h, i) += alpha * lhs(h, j)*val; - } - } - } - } - else if (transposeA && !transposeB) - { - NOT_IMPLEMENTED; - } - else - { - NOT_IMPLEMENTED; - } - } - - //c = alpha * op(lhs) * op(rhs) - template - void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, - const CPUSparseMatrix& rhs, const bool transposeB, CPUSparseMatrix& c) - { - if (lhs.IsEmpty() || rhs.IsEmpty()) - throw std::logic_error("LeftMultiplyAndAdd: one of the input matrix is empty."); - - int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); - int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); - int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); - int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); - - assert (m>0 && k>0 && l>0 && n>0); m; n; //converting from size_t to int may cause overflow - assert (k == l); - if (k != l) - { - throw std::invalid_argument("CPUSparseMatrix::MultiplyAndAdd: The inner dimensions of a and b must match."); - } - - c.Reset(); - - if (!transposeA && !transposeB) - { - NOT_IMPLEMENTED; - } - else if (!transposeA && transposeB) - { - if (rhs.GetFormat() != matrixFormatSparseCSC) - NOT_IMPLEMENTED; - - //allocate enough memory - c.SetFormat(matrixFormatSparseBlockCol); - c.Resize(m, n, m*min(n, rhs.m_nz)); - - map w2Id; - for(size_t j = 0; j < rhs.GetNumCols(); j++) - { // j ranges over batches - size_t start = rhs.m_compIndex[j]; - size_t end = rhs.m_compIndex[j+1]; - - for(size_t p = start; p < end; p++) - { - size_t i = rhs.m_unCompIndex[p]; //i ranges over words - ElemType val = rhs.m_pArray[p]; //1 for(i, j) - - bool first = true; - if(w2Id.find(i) == w2Id.end()) - { - w2Id[i] = w2Id.size(); - c.m_blockIds[c.m_blockSize]=i; - c.m_blockSize++; - } - else - { - first = false; - } - size_t pos = w2Id[i] * lhs.GetNumRows(); - for(size_t h = 0; h < lhs.GetNumRows(); h++) - { // h range over hidden layer - if(first == true) - { - c.m_blockVal[pos] = alpha*lhs(h, j)*val; - } else - { - c.m_blockVal[pos] += alpha*lhs(h, j)*val; - } - pos++; - } - } - } - c.m_nz = c.m_blockSize * m; - if(c.m_nz > c.GetSizeAllocated()) - { - throw std::logic_error("sparse matrix out of range."); - } - //c.SetFormat(matrixFormatSparseBlockCol); - } - else if (transposeA && !transposeB) - { - NOT_IMPLEMENTED; - } - else - { - NOT_IMPLEMENTED; - } - } - - template - void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& rhs) - { - if (lhs.IsEmpty() || rhs.IsEmpty()) - { - throw std::logic_error("ScaleAndAdd: one of the input matrix is empty."); - } - - if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols()) - { - throw std::invalid_argument("CPUSparseMatrix::ScaleAndAdd: The dimensions of a and b must match."); - } - - if(lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSC || lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSR) - { - size_t col_num = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? lhs.GetNumCols(): lhs.GetNumRows(); - for(size_t j = 0; j < col_num; j++) - { - size_t start = lhs.m_compIndex[j]; - size_t end = lhs.m_compIndex[j + 1]; - for(size_t p = start; p < end; p++) - { - size_t i = lhs.m_unCompIndex[p]; - ElemType val = lhs.m_pArray[p]; - size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j; - size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i; - rhs(r, c) += alpha * val; - } - } - } - else if (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol || lhs.m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - for(size_t j = 0; j < lhs.m_blockSize; j++) - { - size_t i = lhs.m_blockIds[j]; - size_t len = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? lhs.GetNumRows() : lhs.GetNumCols(); - size_t start = j * len; - for(size_t p = start; p < start+len; p++) - { - ElemType val = lhs.m_blockVal[p]; - - size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; - size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
i : (p - start); - rhs(r, c) += alpha * val; - } - } - } - else - { - throw std::runtime_error("CPUSparseMatrix:: ScaleAndAdd() Not implemented"); - } - } - - - // a: H x No: H is hidden layer size and No is mini-batch size - // weight: V x H, V is vocab size - // label: V x No - // cls: 2 x Nc, Nc is number of classes, each col is start and end word ids of a class - // idx2cls: V x 1, mapping from word to class id - // etp: V x No, stores predicted values - template - void CPUSparseMatrix::ClassEntropy(const CPUMatrix& a, const CPUMatrix& weight, - const CPUSparseMatrix & label, const CPUMatrix& cls, - const CPUMatrix& idx2cls, CPUSparseMatrix& etp, CPUMatrix& entropyScore) - { - if (a.IsEmpty() || cls.IsEmpty() || label.IsEmpty() || idx2cls.IsEmpty()) - throw std::logic_error("AssignSoftmaxOf: Matrix a, class, idx2cls or label is empty."); - - if(etp.GetFormat() != MatrixFormat::matrixFormatSparseCSC) - throw std::runtime_error("CPUSparseMatrix:: ClassEntropy() only support CSC"); - - size_t nC = cls.GetNumCols(); - size_t nV = label.GetNumRows() - nC; - - if (nV != idx2cls.GetNumRows() || idx2cls.GetNumCols() != 1 || cls.GetNumCols() + idx2cls.GetNumRows() != label.GetNumRows()) - throw std::logic_error("ClassEntropy: check matrix dimension"); - - //allocate enough memory - if(etp.m_elemSizeAllocated < etp.GetNumElements()) - { - etp.Resize(etp.GetNumRows(), etp.GetNumCols(), etp.GetNumElements(), true, false); - } - etp.Reset(); - - entropyScore(0, 0) = 0; - for(size_t j = 0; j < label.GetNumCols(); j++) - { - size_t start = label.m_compIndex[j]; - size_t end = label.m_compIndex[j + 1]; - for (size_t p = start; p < end; p++) - { - size_t i = label.m_unCompIndex[p]; - size_t iStt, iEnd; - if (i < nV) - { - size_t clsid = (size_t)idx2cls(i, 0); - iStt = (size_t) cls(0, clsid); //class start word id - iEnd = (size_t) cls(1, clsid); //class end word id - } - else - { - iStt = nV; - iEnd = nV + nC; - } - - size_t b = etp.m_nz; - for(size_t ii = iStt; ii < iEnd; ii++) //ii ranges over sub-vocab or class ids - { - ElemType val = 0.0; - foreach_row(rw, a) //rw ranges over hidden units - { - val += weight(ii,rw) * a(rw,j); - } - etp.SetValue(ii, j, val); - } - ElemType maxV = LZERO; - for(size_t ii = b; ii < etp.m_nz; ii++) - { - maxV = (ElemType) logadd(maxV, etp.m_pArray[ii]); - } - - for(size_t ii = b; ii < etp.m_nz; ii++) - { - etp.m_pArray[ii] = etp.m_pArray[ii] - maxV; - } - - entropyScore(0, 0) -= etp.m_pArray[b+i-iStt]; - //negate positive data points - etp.m_pArray[b+i-iStt] *=-1; - } - } - } - - - template - void CPUSparseMatrix::ClassEntropyError(CPUSparseMatrix& a) - { - for(int i = 0; i < a.m_nz; i++) - { - if(a.m_pArray[i] < 0) - { - a.m_pArray[i] = exp(a.m_pArray[i]); //negative; - } - else - { - a.m_pArray[i] = exp(-a.m_pArray[i])-1; //positive - } - } - } - - - template - void CPUSparseMatrix::ClassEntropyGradientOfInput( - const CPUSparseMatrix& error, - const CPUMatrix& weight, - CPUMatrix& grd) - { - grd.SetValue(0); - - for(size_t j = 0; j < error.GetNumCols(); j++) - { - size_t start = error.m_compIndex[j]; - size_t end = error.m_compIndex[j+1]; - for(size_t p = start; p < end; p++) - { - size_t i = error.m_unCompIndex[p]; - for(size_t h = 0; h < grd.GetNumRows(); h++) - { // h ranges over hidden units - grd(h,j) += weight(i, h) * error.m_pArray[p]; - } - } - } - } - - - - template - void CPUSparseMatrix::ClassEntropyGradientOfWeight( - const CPUSparseMatrix& error, - const CPUMatrix& input, - const CPUSparseMatrix & /*label*/, - const CPUMatrix& /*cls*/, - const 
CPUMatrix& /*idx2cls*/, - CPUSparseMatrix& grd) - { - grd.SetFormat(matrixFormatSparseBlockRow); - //allocate enough memory - grd.Resize(grd.GetNumRows(), grd.GetNumCols(), error.m_nz*input.GetNumRows(), true, false); - - grd.Reset(); - map w2Id; - for(size_t j = 0; j < error.GetNumCols(); j++) - { - size_t start = error.m_compIndex[j]; - size_t end = error.m_compIndex[j+1]; - - for(size_t p = start; p < end; p++) - { - size_t i = error.m_unCompIndex[p]; // i ranges over words - bool first = true; - if(w2Id.find(i) == w2Id.end()) - { - w2Id[i] = w2Id.size(); - grd.m_blockIds[grd.m_blockSize]=i; - grd.m_blockSize++; - } - else - { - first = false; - } - size_t pos = w2Id[i]*input.GetNumRows(); - for(size_t h = 0; h < input.GetNumRows(); h++) - { // h range over hidden layer - if(first == true) - { - grd.m_blockVal[pos] = input(h, j)*error.m_pArray[p]; - } - else - { - grd.m_blockVal[pos] += input(h, j)*error.m_pArray[p]; - } - pos++; - } - } - } - grd.m_nz = grd.m_blockSize * input.GetNumRows(); - if(grd.m_nz > grd.GetSizeAllocated()) - { - throw std::logic_error("sparse matrix out of range."); - } - //grd.SetFormat(matrixFormatSparseBlockRow); - } - - // normal update for smoothed gradients c and current gradients (this) - template - void CPUSparseMatrix::NormalGrad(CPUMatrix& c, const ElemType momentum) - { - if (c.IsEmpty()) - { - c.Resize(GetNumRows(), GetNumCols()); - c.SetValue(0.0); - } - - if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - for(size_t j = 0; j < m_blockSize; j++) - { - size_t i = m_blockIds[j]; - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols(); - size_t start = j* len; - for(size_t p = start; p < start+len; p++) - { - ElemType val = m_blockVal[p]; - size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; - size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start); - c(row, col) = (1-momentum)*val + momentum*c(row, col); - m_blockVal[p] = c(row, col); - } - } - } - else - { - throw std::runtime_error("CPUSparseMatrix:: NormalGrad() only support block sparse format"); - } - } - - // update smoothed gradients c and current gradients (this) - template - void CPUSparseMatrix::Adagrad(CPUMatrix& c) - { - if (c.IsEmpty()) - { - c.Resize(GetNumRows(), GetNumCols()); - c.SetValue(0.0); - } - - const ElemType floor = 1e-16f; - if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) - { - size_t col_num = (m_format == MatrixFormat::matrixFormatSparseCSC) ? GetNumCols() : GetNumRows(); - for(size_t j = 0; j < col_num; j++) - { - size_t start = m_compIndex[j]; - size_t end = m_compIndex[j+1]; - for(size_t p = start; p < end; p++) - { - size_t i = m_unCompIndex[p]; - ElemType val = m_pArray[p]; - - size_t row = (m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j; - size_t col = (m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i; - ElemType adenorm = c(row, col); - adenorm += val * val; - val = val / (floor + sqrt(adenorm)); - m_pArray[p] = val; - c(row, col) = adenorm; - } - } - } else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - for(size_t j = 0; j < m_blockSize; j++) - { - size_t i = m_blockIds[j]; - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
GetNumRows() : GetNumCols(); - size_t start = j* len; - for(size_t p = start; p < start+len; p++) - { - ElemType val = m_blockVal[p]; - - size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; - size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start); - ElemType adenorm = c(row, col); - adenorm += val * val; - val = val / (floor + sqrt(adenorm)); - m_blockVal[p] = val; - c(row, col) = adenorm; - } - } - } - } - - template - CPUSparseMatrix& CPUSparseMatrix::InplaceTruncate (const ElemType threshold) - { - if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) - { - ElemType locThresholdPos = abs(threshold); - ElemType locTHresholdNeg = -locThresholdPos; - - for(size_t j = 0; j < m_blockSize; j++) - { - size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols(); - size_t start = j* len; - for (size_t p = start; p < start+len; p++) - { - if (m_blockVal[p] > locThresholdPos) - { - m_blockVal[p] = locThresholdPos; - } - else if (m_blockVal[p] < locTHresholdNeg) - { - m_blockVal[p] = locTHresholdNeg; - } - } - } - } - else - { - throw std::runtime_error("CPUSparseMatrix:: InplaceTruncate() only support block based sparse matrix"); - } - return *this; - } - - template - MATH_API File& operator>>(File& stream, CPUSparseMatrix& us) - { - stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); - size_t elsize; - stream >> elsize; - if (sizeof(ElemType) != elsize) - throw std::runtime_error("Template argument size doesn't match those in file"); - std::wstring matrixName; - - // now prepare this header to receive the data being read - size_t nz, colnum, rownum; - int format; - - // read in the header information - stream >> matrixName >> format >> nz >> colnum >> rownum; - - us.SetFormat((MatrixFormat)format); - if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR) - NOT_IMPLEMENTED; - - us.Resize(rownum, colnum, nz); - - if (nz > 0) - { - size_t compressedSize = (us.GetFormat() == matrixFormatSparseCSC) ? 
colnum + 1 : rownum + 1; - ElemType* dataBuffer = us.NzValues(); - CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation(); - CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation(); - - // read in the sparse matrix info - for (size_t i = 0; i < nz; ++i) - { - stream >> dataBuffer[i]; - } - for (size_t i = 0; i < nz; ++i) - { - stream >> unCompressedIndex[i]; - } - for (size_t i = 0; i < compressedSize; ++i) - { - stream >> compressedIndex[i]; - } - } - stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); - - us.SetMatrixName(matrixName.c_str()); - - return stream; - } - - template MATH_API File& operator>>(File& stream, CPUSparseMatrix& us); - template MATH_API File& operator>>(File& stream, CPUSparseMatrix& us); - - template - MATH_API File& operator<<(File& stream, const CPUSparseMatrix& us) - { - if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR) - NOT_IMPLEMENTED; - - stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); - stream << sizeof(ElemType); - if (us.GetMatrixName() == nullptr) - { - std::wstring s(L"nnmatrix"); - stream << s; - } - else - { - stream << us.GetMatrixName(); - } - - size_t nz, numRows, numCols; - size_t compressedSize = us.SecondaryIndexCount(); - int format = us.GetFormat(); - - stream << format << nz << numCols << numRows; - - if (nz > 0) - { - ElemType* dataBuffer = us.NzValues(); - CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation(); - CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation(); - - for (size_t i = 0; i < nz; ++i) - { - stream << dataBuffer[i]; - } - for (size_t i = 0; i < nz; ++i) - { - stream << unCompressedIndex[i]; - } - for (size_t i = 0; i < compressedSize; ++i) - { - stream << compressedIndex[i]; - } - } - stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT")); - - return stream; - } - - template class CPUSparseMatrix; - template class CPUSparseMatrix; - -}}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// Math.cpp : Defines the exported functions for the DLL application. +// + +#include "stdafx.h" +#include +#include +#include +#include +#include "CPUMatrix.h" +#include "CPUSparseMatrix.h" +#include +#include +#ifdef _WIN32 +#include +#endif +#ifdef LEAKDETECT +#include +#endif + +#include "basetypes.h" +#include "fileutil.h" + + +#ifndef USE_MKL +// use ACML as default. +// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above +// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/ +// Install the ifort64 variant (compiled with intel compiler) of the library +// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml +// to point to your folder for the include file and link library +#include // requires ACML 5.3.0 and above +#else +// requires MKL 10.0 and above +#endif + +// This is an example of an exported variable +//MATH_API int nMath=0; + +// This is an example of an exported function. 
+//MATH_API int fnMath(void)
+//{
+//    return 42;
+//}
+
+#ifndef USE_MKL  //MKL has one additional parameter for different matrix order
+#define BLAS_COLMAJOR
+#else
+#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor,
+#endif
+
+#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);}
+#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0-based indexing
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+#pragma region Helpful Enum Definitions
+    enum class MatrixOrder
+    {
+        RowMajor = 101,  // row-major arrays
+        ColMajor = 102   // column-major arrays
+    };
+
+    enum class MatrixTranspose : char
+    {
+        NoTrans = 'N',   // trans='N'
+        Trans = 'T',     // trans='T'
+        ConjTrans = 'C'  // trans='C'
+    };
+
+    enum class SymMatrixType : char
+    {
+        Up = 'U',            // symmetric matrix is stored in the upper part
+        Low = 'L',           // symmetric matrix is stored in the lower part
+        Full = 'F',          // fully populated
+        NotSymmetric = 'N'   // not a symmetric matrix
+    };
+
+    enum class MatrixOpSide : char
+    {
+        Left = 'L',   // left multiply
+        Right = 'R',  // right multiply
+    };
+#pragma endregion Helpful Enum Definitions
+
+#pragma region Constructors and Destructor
+
+    //should only be used by constructors.
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ZeroInit()
+    {
+        m_numRows = 0;
+        m_numCols = 0;
+        m_elemSizeAllocated = 0;
+        m_compIndexSize = 0;
+        m_externalBuffer = false;
+        m_computeDevice = CPUDEVICE;
+        m_nz = 0;
+        m_matrixName = NULL;
+
+        //if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+        {
+            m_colIdx = -1;
+            m_pArray = NULL;
+            m_unCompIndex = NULL;
+            m_compIndex = NULL;
+        }
+        //else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            m_blockSize = 0;
+            m_pArray = NULL;
+            m_blockIds = NULL;
+        }
+    }
+
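For reference, the CSC layout that m_pArray, m_unCompIndex, and m_compIndex implement can be pictured with a minimal standalone sketch; the arrays below hold hypothetical values and are independent of this class:

    #include <cstdio>

    // CSC view of the 3x3 matrix [ 1 0 2 ; 0 0 3 ; 4 0 0 ]:
    // pArray      = nonzero values in column-major visit order,
    // unCompIndex = row id of each stored value,
    // compIndex[c]..compIndex[c+1] = range of stored values belonging to column c.
    int main()
    {
        float pArray[]      = { 1, 4, 2, 3 };
        int   unCompIndex[] = { 0, 2, 0, 1 };
        int   compIndex[]   = { 0, 2, 2, 4 };  // column 1 is empty, so both bounds are 2

        for (int c = 0; c < 3; c++)
            for (int p = compIndex[c]; p < compIndex[c + 1]; p++)
                printf("(%d,%d) = %g\n", unCompIndex[p], c, pArray[p]);
        return 0;
    }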
+    //should only be used by constructors.
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
+    {
+        if (format != MatrixFormat::matrixFormatSparseCSC && format != MatrixFormat::matrixFormatSparseCSR && format != MatrixFormat::matrixFormatSparseBlockCol && format != MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format");
+        }
+        m_format = format;
+        m_default = defaultElem();
+        ZeroInit();
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
+    {
+        CheckInit(format);
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
+    {
+        CheckInit(format);
+        Resize(numRows, numCols, size);
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
+    {
+        if (m_matrixName != NULL)
+        {
+            delete[] m_matrixName;
+            m_matrixName = nullptr;
+        }
+        if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+        {
+            if (m_pArray != NULL)
+                delete[] m_pArray;
+            if (m_unCompIndex != NULL)
+                delete[] m_unCompIndex;
+            if (m_compIndex != NULL)
+                delete[] m_compIndex;
+        }
+        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            if (m_pArray != NULL)
+                delete[] m_pArray;
+            if (m_blockIds != NULL)
+                delete[] m_blockIds;
+        }
+    }
+
+
+
+#pragma endregion Constructors and Destructor
+
+#pragma region Basic Operators
+
+    //values must be set in column order for CSC and in row order for CSR
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, const ElemType v)
+    {
+        if (m_format != MatrixFormat::matrixFormatSparseCSC && m_format != MatrixFormat::matrixFormatSparseCSR)
+        {
+            throw std::logic_error("CPUSparseMatrix: unsupported SetValue() call.");
+        }
+
+        if (m_elemSizeAllocated < m_nz + 1)  //automatic resize
+        {
+            Resize(m_numRows, m_numCols, m_nz + 100);  //allocate 100 more elements and keep existing values
+        }
+
+        if (row >= m_numRows)
+        {
+            throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id");
+        }
+
+        if (col >= m_numCols)
+        {
+            throw std::logic_error("CPUSparseMatrix: SetValue() invalid column id");
+        }
+
+        size_t r = (m_format == matrixFormatSparseCSC) ? row : col;
+        size_t c = (m_format == matrixFormatSparseCSC) ? col : row;
+
+        m_pArray[m_nz] = v;
+        m_unCompIndex[m_nz] = (CPUSPARSE_INDEX_TYPE)r;
+
+        //consistency check
+        if (c == m_colIdx && r <= m_unCompIndex[m_nz-1])
+        {
+            throw std::logic_error("CPUSparseMatrix: SetValue is not called properly");
+        }
+
+        if (c != m_colIdx)
+        {
+            m_compIndex[c] = CPUSPARSE_INDEX_TYPE(m_nz);
+            m_colIdx = (int) c;
+        }
+        m_compIndex[c + 1] = CPUSPARSE_INDEX_TYPE(m_nz + 1);
+        m_nz++;
+    }
+
+    template<class ElemType>
+    ElemType* CPUSparseMatrix<ElemType>::BufferPointer() const
+    {
+        return m_pArray;
+    }
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
+    {
+        size_t newCompIndexSize = (numCols > numRows ?
numCols : numRows) + 1; + bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize); + + m_numRows = numRows; + m_numCols = numCols; + + if (reallocate) + { + if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) + { + ElemType *pArray = new ElemType[numNZElemToReserve]; + CPUSPARSE_INDEX_TYPE *unCompIndex = new CPUSPARSE_INDEX_TYPE[numNZElemToReserve]; + CPUSPARSE_INDEX_TYPE *compIndex = new CPUSPARSE_INDEX_TYPE[newCompIndexSize]; + + if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) + throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); + + if (keepExistingValues && m_nz > 0) + { + assert(m_compIndexSize > 0 && m_nz < numNZElemToReserve); + memcpy(pArray, m_pArray, NzSize()); + memcpy(unCompIndex, m_unCompIndex, MajorIndexSize()); + memcpy(compIndex, m_compIndex, SecondaryIndexSize()); + } + + if (m_pArray != NULL) + delete[] m_pArray; + if (m_unCompIndex != NULL) + delete[] m_unCompIndex; + if (m_compIndex != NULL) + delete[] m_compIndex; + + m_pArray = pArray; + m_unCompIndex = unCompIndex; + m_compIndex = compIndex; + } + else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) + { + ElemType *blockVal = new ElemType[numNZElemToReserve]; + size_t *blockIds = new size_t[newCompIndexSize]; + + if (keepExistingValues && (m_nz > numNZElemToReserve || m_compIndexSize > newCompIndexSize)) + throw std::logic_error("Resize: To keep values m_nz should <= numNZElemToReserve and m_compIndexSize <= newCompIndexSize"); + + if (keepExistingValues && m_elemSizeAllocated > 0) + { + assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve); + memcpy(blockVal, m_pArray, NzSize()); + memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize); + } + + if (m_pArray != NULL) + delete[] m_pArray; + if(m_blockIds != NULL) + delete[] m_blockIds; + + m_pArray = blockVal; + m_blockIds = blockIds; + } + + m_elemSizeAllocated = numNZElemToReserve; + m_compIndexSize = newCompIndexSize; + } + } + + //Reset matrix so it can be reused + template + void CPUSparseMatrix::Reset() + { + m_nz = 0; + m_colIdx = -1; + m_blockSize = 0; + } + + //c = alpha*op(lhs) * op(rhs) + beta*c + template + void CPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, + const CPUSparseMatrix& rhs, const bool transposeB, ElemType beta, CPUMatrix& c) + + { + if (lhs.IsEmpty() || rhs.IsEmpty()) + throw std::logic_error("MultiplyAndWeightedAdd: one of the input matrix is empty."); + + int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); + int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); + int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); + int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); + + assert (m>0 && k>0 && l>0 && n>0); //converting from size_t to int may cause overflow + assert (k == l); + if (k != l) + { + throw std::invalid_argument("CPUSparseMatrix::MultiplyAndWeightedAdd: The inner dimensions of a and b must match."); + } + + if (c.GetNumRows() != m || c.GetNumCols() != n) + { + c.Resize(m,n); + } + + if (beta == 0) + { + memset(c.GetArray(), 0, sizeof(ElemType) * c.GetNumElements()); + } + else if (beta != 1) + { +#pragma omp parallel for + foreach_coord(i,j,c) + { + c(i,j) = beta * c(i,j); + } + } + + if (rhs.GetFormat() != matrixFormatSparseCSC) + NOT_IMPLEMENTED; + + if (!transposeA && !transposeB) + { + for(size_t j = 0; j < rhs.GetNumCols(); j++) + { + size_t start = rhs.m_compIndex[j]; //ColLocation + size_t end = rhs.m_compIndex[j+1]; + for(size_t p = start; p < end; p++) + { + size_t i = rhs.m_unCompIndex[p]; //RowLocation + ElemType val = rhs.m_pArray[p]; + + for(size_t h = 0; h < lhs.GetNumRows(); h++) + { + c(h,j) += alpha * lhs(h, i)*val; + } + } + } + } + else if (!transposeA && transposeB) + { + for(size_t j = 0; j < rhs.GetNumCols(); j++) + { + size_t start = rhs.m_compIndex[j]; + size_t end = rhs.m_compIndex[j + 1]; + + for(size_t p = start; p < end; p++) + { + size_t i = rhs.m_unCompIndex[p]; + ElemType val = rhs.m_pArray[p]; + for(size_t h = 0; h < lhs.GetNumRows(); h++) + { + c(h, i) += alpha * lhs(h, j)*val; + } + } + } + } + else if (transposeA && !transposeB) + { + NOT_IMPLEMENTED; + } + else + { + NOT_IMPLEMENTED; + } + } + + //c = alpha * op(lhs) * op(rhs) + template + void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, + const CPUSparseMatrix& rhs, const bool transposeB, CPUSparseMatrix& c) + { + if (lhs.IsEmpty() || rhs.IsEmpty()) + throw std::logic_error("LeftMultiplyAndAdd: one of the input matrix is empty."); + + int m = transposeA? (int)lhs.GetNumCols(): (int)lhs.GetNumRows(); + int k = transposeA? (int)lhs.GetNumRows(): (int)lhs.GetNumCols(); + int l = transposeB? (int)rhs.GetNumCols(): (int)rhs.GetNumRows(); + int n = transposeB? 
(int)rhs.GetNumRows(): (int)rhs.GetNumCols(); + + assert (m>0 && k>0 && l>0 && n>0); m; n; //converting from size_t to int may cause overflow + assert (k == l); + if (k != l) + { + throw std::invalid_argument("CPUSparseMatrix::MultiplyAndAdd: The inner dimensions of a and b must match."); + } + + c.Reset(); + + if (!transposeA && !transposeB) + { + NOT_IMPLEMENTED; + } + else if (!transposeA && transposeB) + { + if (rhs.GetFormat() != matrixFormatSparseCSC) + NOT_IMPLEMENTED; + + //allocate enough memory + c.SetFormat(matrixFormatSparseBlockCol); + c.Resize(m, n, m*min(n, rhs.m_nz)); + + map w2Id; + for(size_t j = 0; j < rhs.GetNumCols(); j++) + { // j ranges over batches + size_t start = rhs.m_compIndex[j]; + size_t end = rhs.m_compIndex[j+1]; + + for(size_t p = start; p < end; p++) + { + size_t i = rhs.m_unCompIndex[p]; //i ranges over words + ElemType val = rhs.m_pArray[p]; //1 for(i, j) + + bool first = true; + if(w2Id.find(i) == w2Id.end()) + { + w2Id[i] = w2Id.size(); + c.m_blockIds[c.m_blockSize]=i; + c.m_blockSize++; + } + else + { + first = false; + } + size_t pos = w2Id[i] * lhs.GetNumRows(); + for(size_t h = 0; h < lhs.GetNumRows(); h++) + { // h range over hidden layer + if(first == true) + { + c.m_pArray[pos] = alpha*lhs(h, j)*val; + } else + { + c.m_pArray[pos] += alpha*lhs(h, j)*val; + } + pos++; + } + } + } + c.m_nz = c.m_blockSize * m; + if(c.m_nz > c.GetSizeAllocated()) + { + throw std::logic_error("sparse matrix out of range."); + } + //c.SetFormat(matrixFormatSparseBlockCol); + } + else if (transposeA && !transposeB) + { + NOT_IMPLEMENTED; + } + else + { + NOT_IMPLEMENTED; + } + } + + template + void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& rhs) + { + if (lhs.IsEmpty() || rhs.IsEmpty()) + { + throw std::logic_error("ScaleAndAdd: one of the input matrix is empty."); + } + + if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols()) + { + throw std::invalid_argument("CPUSparseMatrix::ScaleAndAdd: The dimensions of a and b must match."); + } + + if(lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSC || lhs.GetFormat() == MatrixFormat::matrixFormatSparseCSR) + { + size_t col_num = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? lhs.GetNumCols(): lhs.GetNumRows(); + for(size_t j = 0; j < col_num; j++) + { + size_t start = lhs.m_compIndex[j]; + size_t end = lhs.m_compIndex[j + 1]; + for(size_t p = start; p < end; p++) + { + size_t i = lhs.m_unCompIndex[p]; + ElemType val = lhs.m_pArray[p]; + size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j; + size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i; + rhs(r, c) += alpha * val; + } + } + } + else if (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol || lhs.m_format == MatrixFormat::matrixFormatSparseBlockRow) + { + for(size_t j = 0; j < lhs.m_blockSize; j++) + { + size_t i = lhs.m_blockIds[j]; + size_t len = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? lhs.GetNumRows() : lhs.GetNumCols(); + size_t start = j * len; + for(size_t p = start; p < start+len; p++) + { + ElemType val = lhs.m_pArray[p]; + + size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i; + size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? 
i : (p - start);
+                    rhs(r, c) += alpha * val;
+                }
+            }
+        }
+        else
+        {
+            throw std::runtime_error("CPUSparseMatrix::ScaleAndAdd() not implemented for this format");
+        }
+    }
+
+
+    template<class ElemType>
+    bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            throw std::logic_error("AreEqual: one of the input matrices is empty.");
+
+        if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
+            return false;
+
+        bool result = true;
+
+        //not parallelized with OpenMP: break is not permitted inside an OpenMP for loop
+        foreach_coord(i, j, a)
+        {
+            if (abs(a(i, j) - b(i, j)) > threshold)
+            {
+                result = false;
+                break;  //foreach_coord expands to nested loops, so this exits only the inner loop; result stays false regardless
+            }
+        }
+
+        return result;
+    }
+
+    // a: H x No: H is the hidden layer size and No is the mini-batch size
+    // weight: V x H, V is the vocab size
+    // label: V x No
+    // cls: 2 x Nc, Nc is the number of classes; each col holds the start and end word ids of a class
+    // idx2cls: V x 1, mapping from word to class id
+    // etp: V x No, stores predicted values
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropy(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& weight,
+        const CPUSparseMatrix<ElemType>& label, const CPUMatrix<ElemType>& cls,
+        const CPUMatrix<ElemType>& idx2cls, CPUSparseMatrix<ElemType>& etp, CPUMatrix<ElemType>& entropyScore)
+    {
+        if (a.IsEmpty() || cls.IsEmpty() || label.IsEmpty() || idx2cls.IsEmpty())
+            throw std::logic_error("ClassEntropy: Matrix a, cls, idx2cls or label is empty.");
+
+        if (etp.GetFormat() != MatrixFormat::matrixFormatSparseCSC)
+            throw std::runtime_error("CPUSparseMatrix::ClassEntropy() only supports CSC");
+
+        size_t nC = cls.GetNumCols();
+        size_t nV = label.GetNumRows() - nC;
+
+        if (nV != idx2cls.GetNumRows() || idx2cls.GetNumCols() != 1 || cls.GetNumCols() + idx2cls.GetNumRows() != label.GetNumRows())
+            throw std::logic_error("ClassEntropy: check matrix dimension");
+
+        //allocate enough memory
+        if (etp.m_elemSizeAllocated < etp.GetNumElements())
+        {
+            etp.Resize(etp.GetNumRows(), etp.GetNumCols(), etp.GetNumElements(), true, false);
+        }
+        etp.Reset();
+
+        entropyScore(0, 0) = 0;
+        for (size_t j = 0; j < label.GetNumCols(); j++)
+        {
+            size_t start = label.m_compIndex[j];
+            size_t end = label.m_compIndex[j + 1];
+            for (size_t p = start; p < end; p++)
+            {
+                size_t i = label.m_unCompIndex[p];
+                size_t iStt, iEnd;
+                if (i < nV)
+                {
+                    size_t clsid = (size_t)idx2cls(i, 0);
+                    iStt = (size_t) cls(0, clsid);  //class start word id
+                    iEnd = (size_t) cls(1, clsid);  //class end word id
+                }
+                else
+                {
+                    iStt = nV;
+                    iEnd = nV + nC;
+                }
+
+                size_t b = etp.m_nz;
+                for (size_t ii = iStt; ii < iEnd; ii++)  //ii ranges over the sub-vocab or class ids
+                {
+                    ElemType val = 0.0;
+                    foreach_row(rw, a)  //rw ranges over hidden units
+                    {
+                        val += weight(ii, rw) * a(rw, j);
+                    }
+                    etp.SetValue(ii, j, val);
+                }
+                ElemType maxV = LZERO;
+                for (size_t ii = b; ii < etp.m_nz; ii++)
+                {
+                    maxV = (ElemType) logadd(maxV, etp.m_pArray[ii]);
+                }
+
+                for (size_t ii = b; ii < etp.m_nz; ii++)
+                {
+                    etp.m_pArray[ii] = etp.m_pArray[ii] - maxV;
+                }
+
+                entropyScore(0, 0) -= etp.m_pArray[b+i-iStt];
+                //negate positive data points
+                etp.m_pArray[b+i-iStt] *= -1;
+            }
+        }
+    }
+
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropyError(CPUSparseMatrix<ElemType>& a)
+    {
+        for (size_t i = 0; i < a.m_nz; i++)
+        {
+            if (a.m_pArray[i] < 0)
+            {
+                a.m_pArray[i] = exp(a.m_pArray[i]);  //negative
+            }
+            else
+            {
+                a.m_pArray[i] = exp(-a.m_pArray[i]) - 1;  //positive
+            }
+        }
+    }
+
+
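ClassEntropy above normalizes each class or sub-vocabulary block with a running log-sum-exp accumulated through logadd. Assuming logadd has the usual log(exp(x) + exp(y)) meaning, a minimal standalone sketch of that accumulation (logAdd here is a local stand-in, not this library's function):

    #include <cmath>
    #include <cstdio>

    // numerically stable logAdd(x, y) = log(exp(x) + exp(y))
    double logAdd(double x, double y)
    {
        if (x < y) { double t = x; x = y; y = t; }  // keep x = max(x, y)
        return x + log1p(exp(y - x));
    }

    int main()
    {
        const double scores[] = { -1.0, 0.5, 2.0 };
        double acc = -1e30;  // plays the role of LZERO, an effective -infinity
        for (double s : scores)
            acc = logAdd(acc, s);
        printf("log-sum-exp = %f\n", acc);  // log(exp(-1) + exp(0.5) + exp(2))
        return 0;
    }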
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropyGradientOfInput(
+        const CPUSparseMatrix<ElemType>& error,
+        const CPUMatrix<ElemType>& weight,
+        CPUMatrix<ElemType>& grd)
+    {
+        grd.SetValue(0);
+
+        for (size_t j = 0; j < error.GetNumCols(); j++)
+        {
+            size_t start = error.m_compIndex[j];
+            size_t end = error.m_compIndex[j+1];
+            for (size_t p = start; p < end; p++)
+            {
+                size_t i = error.m_unCompIndex[p];
+                for (size_t h = 0; h < grd.GetNumRows(); h++)
+                {   //h ranges over hidden units
+                    grd(h, j) += weight(i, h) * error.m_pArray[p];
+                }
+            }
+        }
+    }
+
+
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ClassEntropyGradientOfWeight(
+        const CPUSparseMatrix<ElemType>& error,
+        const CPUMatrix<ElemType>& input,
+        const CPUSparseMatrix<ElemType>& /*label*/,
+        const CPUMatrix<ElemType>& /*cls*/,
+        const CPUMatrix<ElemType>& /*idx2cls*/,
+        CPUSparseMatrix<ElemType>& grd)
+    {
+        grd.SetFormat(matrixFormatSparseBlockRow);
+        //allocate enough memory
+        grd.Resize(grd.GetNumRows(), grd.GetNumCols(), error.m_nz*input.GetNumRows(), true, false);
+
+        grd.Reset();
+        map<size_t, size_t> w2Id;
+        for (size_t j = 0; j < error.GetNumCols(); j++)
+        {
+            size_t start = error.m_compIndex[j];
+            size_t end = error.m_compIndex[j+1];
+
+            for (size_t p = start; p < end; p++)
+            {
+                size_t i = error.m_unCompIndex[p];  //i ranges over words
+                bool first = true;
+                if (w2Id.find(i) == w2Id.end())
+                {
+                    w2Id[i] = w2Id.size();
+                    grd.m_blockIds[grd.m_blockSize] = i;
+                    grd.m_blockSize++;
+                }
+                else
+                {
+                    first = false;
+                }
+                size_t pos = w2Id[i]*input.GetNumRows();
+                for (size_t h = 0; h < input.GetNumRows(); h++)
+                {   //h ranges over the hidden layer
+                    if (first == true)
+                    {
+                        grd.m_pArray[pos] = input(h, j)*error.m_pArray[p];
+                    }
+                    else
+                    {
+                        grd.m_pArray[pos] += input(h, j)*error.m_pArray[p];
+                    }
+                    pos++;
+                }
+            }
+        }
+        grd.m_nz = grd.m_blockSize * input.GetNumRows();
+        if (grd.m_nz > grd.GetSizeAllocated())
+        {
+            throw std::logic_error("sparse matrix out of range.");
+        }
+        //grd.SetFormat(matrixFormatSparseBlockRow);
+    }
+
+    // momentum update of the smoothed gradients c from the current gradients (this)
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum)
+    {
+        if (c.IsEmpty())
+        {
+            c.Resize(GetNumRows(), GetNumCols());
+            c.SetValue(0.0);
+        }
+
+        if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                size_t i = m_blockIds[j];
+                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
+                size_t start = j * len;
+                for (size_t p = start; p < start+len; p++)
+                {
+                    ElemType val = m_pArray[p];
+                    size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
+                    size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
+                    c(row, col) = (1-momentum)*val + momentum*c(row, col);
+                    m_pArray[p] = c(row, col);
+                }
+            }
+        }
+        else
+        {
+            throw std::runtime_error("CPUSparseMatrix::NormalGrad() only supports the block sparse formats");
+        }
+    }
+
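Per stored element, NormalGrad applies standard momentum smoothing and writes the smoothed value back into the sparse gradient. A minimal numeric sketch with hypothetical values:

    #include <cstdio>

    int main()
    {
        float momentum = 0.9f;
        float c = 0.2f;     // smoothed gradient so far
        float grad = 1.0f;  // current gradient value from the sparse block
        c = (1 - momentum) * grad + momentum * c;  // same formula as NormalGrad
        printf("smoothed = %g\n", c);  // prints 0.28
        return 0;
    }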
+    // update the smoothed gradients c and the current gradients (this)
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c)
+    {
+        if (c.IsEmpty())
+        {
+            c.Resize(GetNumRows(), GetNumCols());
+            c.SetValue(0.0);
+        }
+
+        const ElemType floor = 1e-16f;
+        if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+        {
+            size_t col_num = (m_format == MatrixFormat::matrixFormatSparseCSC) ? GetNumCols() : GetNumRows();
+            for (size_t j = 0; j < col_num; j++)
+            {
+                size_t start = m_compIndex[j];
+                size_t end = m_compIndex[j+1];
+                for (size_t p = start; p < end; p++)
+                {
+                    size_t i = m_unCompIndex[p];
+                    ElemType val = m_pArray[p];
+
+                    size_t row = (m_format == MatrixFormat::matrixFormatSparseCSC) ? i : j;
+                    size_t col = (m_format == MatrixFormat::matrixFormatSparseCSC) ? j : i;
+                    ElemType adenorm = c(row, col);
+                    adenorm += val * val;
+                    val = val / (floor + sqrt(adenorm));
+                    m_pArray[p] = val;
+                    c(row, col) = adenorm;
+                }
+            }
+        }
+        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                size_t i = m_blockIds[j];
+                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
+                size_t start = j * len;
+                for (size_t p = start; p < start+len; p++)
+                {
+                    ElemType val = m_pArray[p];
+
+                    size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
+                    size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
+                    ElemType adenorm = c(row, col);
+                    adenorm += val * val;
+                    val = val / (floor + sqrt(adenorm));
+                    m_pArray[p] = val;
+                    c(row, col) = adenorm;
+                }
+            }
+        }
+    }
+
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceTruncate(const ElemType threshold)
+    {
+        if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+        {
+            ElemType locThresholdPos = abs(threshold);
+            ElemType locThresholdNeg = -locThresholdPos;
+
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
+                size_t start = j * len;
+                for (size_t p = start; p < start+len; p++)
+                {
+                    if (m_pArray[p] > locThresholdPos)
+                    {
+                        m_pArray[p] = locThresholdPos;
+                    }
+                    else if (m_pArray[p] < locThresholdNeg)
+                    {
+                        m_pArray[p] = locThresholdNeg;
+                    }
+                }
+            }
+        }
+        else
+        {
+            throw std::runtime_error("CPUSparseMatrix::InplaceTruncate() only supports block-based sparse matrices");
+        }
+        return *this;
+    }
+
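The stream operators that follow use a simple sectioned record. Reading operator>> below together with operator<< further down, the on-disk layout appears to be:

    BMAT marker
    sizeof(ElemType)                           element-size guard
    matrix name                                wstring
    format, nz, numCols, numRows               header fields
    nz element values                          NzValues()
    nz major indices (row ids for CSC)         MajorIndexLocation()
    numCols+1 (CSC) or numRows+1 (CSR) ids     SecondaryIndexLocation()
    EMAT marker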
+    template<class ElemType>
+    MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
+    {
+        stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
+        size_t elsize;
+        stream >> elsize;
+        if (sizeof(ElemType) != elsize)
+            throw std::runtime_error("Template argument size doesn't match those in file");
+        std::wstring matrixName;
+
+        // now prepare this header to receive the data being read
+        size_t nz, colnum, rownum;
+        int format;
+
+        // read in the header information
+        stream >> matrixName >> format >> nz >> colnum >> rownum;
+
+        us.SetFormat((MatrixFormat)format);
+        if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
+            NOT_IMPLEMENTED;
+
+        us.Resize(rownum, colnum, nz);
+
+        if (nz > 0)
+        {
+            size_t compressedSize = (us.GetFormat() == matrixFormatSparseCSC) ? colnum + 1 : rownum + 1;
+            ElemType* dataBuffer = us.NzValues();
+            CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
+            CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
+
+            // read in the sparse matrix info
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream >> dataBuffer[i];
+            }
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream >> unCompressedIndex[i];
+            }
+            for (size_t i = 0; i < compressedSize; ++i)
+            {
+                stream >> compressedIndex[i];
+            }
+        }
+        stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
+
+        us.SetMatrixName(matrixName.c_str());
+
+        return stream;
+    }
+
+    template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
+    template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
+
+    template<class ElemType>
+    MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
+    {
+        if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
+            NOT_IMPLEMENTED;
+
+        stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
+        stream << sizeof(ElemType);
+        if (us.GetMatrixName() == nullptr)
+        {
+            std::wstring s(L"nnmatrix");
+            stream << s;
+        }
+        else
+        {
+            stream << us.GetMatrixName();
+        }
+
+        // header fields: nonzero count and actual dimensions
+        size_t nz = us.NzSize() / sizeof(ElemType);
+        size_t numRows = us.GetNumRows();
+        size_t numCols = us.GetNumCols();
+        size_t compressedSize = us.SecondaryIndexCount();
+        int format = us.GetFormat();
+
+        stream << format << nz << numCols << numRows;
+
+        if (nz > 0)
+        {
+            ElemType* dataBuffer = us.NzValues();
+            CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
+            CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
+
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream << dataBuffer[i];
+            }
+            for (size_t i = 0; i < nz; ++i)
+            {
+                stream << unCompressedIndex[i];
+            }
+            for (size_t i = 0; i < compressedSize; ++i)
+            {
+                stream << compressedIndex[i];
+            }
+        }
+        stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
+
+        return stream;
+    }
+
+    template class CPUSparseMatrix<float>;
+    template class CPUSparseMatrix<double>;
+
+}}}
diff --git a/Math/Math/CPUSparseMatrix.h b/Math/Math/CPUSparseMatrix.h
index 43885f928..aac886ec1 100644
--- a/Math/Math/CPUSparseMatrix.h
+++ b/Math/Math/CPUSparseMatrix.h
@@ -33,7 +33,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     public:
         CPUSparseMatrix(const MatrixFormat format);
         CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
-
+
+        ~CPUSparseMatrix();
 
     public:
@@ -76,6 +77,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
 
+        static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
+
         /// sum(vec(a).*vec(b))
         static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/) { NOT_IMPLEMENTED; }
@@ -89,6 +92,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
         void Reset();
 
+        //zero-initialized element returned for positions that are not stored explicitly
+        inline ElemType defaultElem()
+        {
+            ElemType defaultVal;
+            memset(&defaultVal, 0, sizeof(ElemType));
+            return defaultVal;
+        }
+
+        const ElemType& operator() (const size_t row, const size_t col) const
+        {
+            if (col >= m_numCols || row >= m_numRows)
+            {
+                throw std::runtime_error("Position outside matrix dimensions");
+            }
+
+            if (m_format == MatrixFormat::matrixFormatSparseCSC)
+            {
+                size_t start = m_compIndex[col];
+                size_t end = m_compIndex[col + 1];
+                for (size_t p = start; p < end; p++)
+                {
+                    size_t i =
m_unCompIndex[p]; + if (i == row) + { + return m_pArray[p]; + } + } + + return m_default; + } + else + { + NOT_IMPLEMENTED; + } + } + public: void NormalGrad(CPUMatrix& c, const ElemType momentum); void Adagrad(CPUMatrix& c); @@ -103,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: const ElemType* NzValues() const { return m_pArray; } - ElemType* NzValues() { return m_pArray; } + inline ElemType* NzValues() { return m_pArray; } size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format @@ -139,9 +177,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { CPUSPARSE_INDEX_TYPE *m_unCompIndex; //row/col ids in CSC/CSR format CPUSPARSE_INDEX_TYPE *m_compIndex; //begin ids of col/row in CSC/CSR format - size_t m_blockSize; //block size - ElemType *m_blockVal; //block values + size_t m_blockSize; //block size size_t *m_blockIds; //block ids + + ElemType m_default; }; typedef CPUSparseMatrix CPUSingleSparseMatrix; diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 0f2359078..bd6089240 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1,3479 +1,3440 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - -#include "stdafx.h" -#include "BestGpu.h" - -#ifndef CPUONLY - -#include "cublas_v2.h" -#include -#include -#include -#include -#include -#include "device_launch_parameters.h" -#include "GPUMatrix.h" -#include "GPUMatrixCUDAKernels.cu" -#include "GPUSparseMatrix.h" -#include // for cout - -#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs -#pragma comment (lib, "cublas.lib") -#pragma comment (lib, "cusparse.lib") -#pragma comment (lib, "curand.lib") - -#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<>> syntax if a and b are size_t -#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this -#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons - -#ifdef NO_SYNC -bool do_sync = false; -#else -bool do_sync = true; -#endif - -#ifdef _WIN32 -// thread local storage to access the current stream, initalize to default stream -__declspec (thread) -#endif -cudaStream_t t_stream = cudaStreamDefault; - -extern int _ConvertSMVer2Cores(int major, int minor); // forward declaration - -// SetStream - set the stream that will be used by the GPU routines -void MATH_API SetStream(cudaStream_t stream) -{ - t_stream = stream; -} - -// GetStream - get the stream that will be used by the GPU routines -cudaStream_t MATH_API GetStream() -{ - return t_stream; -} - - -void CURAND_CALL(curandStatus x) -{ - if(x!=CURAND_STATUS_SUCCESS) - { - throw std::runtime_error("CURAND fail"); - } -} - -void CUBLAS_CALL(cublasStatus_t x) -{ - if(x!=CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("CUBLAS fail"); - } -} - -void CUDA_CALL(cudaError_t x) -{ - if(x!=cudaSuccess) - { - const char* errmsg = cudaGetErrorString(x); - std::cerr << "!!!!!!!!CUDA EXCEPTION: " << errmsg << std::endl; - cudaDeviceSynchronize(); - throw std::runtime_error(errmsg); - } -} - -namespace Microsoft { namespace MSR { namespace CNTK { - - // PrepareDevice - Setup the correct cuda context for an operation - // deviceId - the device on which the operation will take place - void PrepareDevice(DEVICEID_TYPE deviceId) - { - 
static DEVICEID_TYPE currentDevice = AUTOPLACEMATRIX; // set to anything valid - // externally managed matrices are guaranteed to be on the right device - if (deviceId == MANAGEDEXTERN) - return; - // and if we last set the device to be this device we are good - if (deviceId == currentDevice) - return; - CUDA_CALL(cudaSetDevice(deviceId)); - currentDevice=deviceId; - } - -#pragma region DeviceBoundNumber class - - template - DeviceBoundNumber::DeviceBoundNumber(const DeviceBoundNumber &/*deepCopy*/) - { - NOT_IMPLEMENTED; - } - - template - DeviceBoundNumber::DeviceBoundNumber(DeviceBoundNumber &&shallowCopy) - { - ShallowCopyFrom(shallowCopy.m_data,shallowCopy.m_computeDevice); - shallowCopy.m_data=NULL; - } - - template - void DeviceBoundNumber::ShallowCopyFrom(ElemType* newVal,int newValsDevceId) - { - m_computeDevice = newValsDevceId; - m_data = newVal; - } - - template - DeviceBoundNumber::~DeviceBoundNumber() - { - if (m_data!=NULL) - { - if (m_computeDevice<0) - { - delete m_data; - m_data = NULL; - } - else if (m_computeDevice != MANAGEDEXTERN) - CUDA_CALL(cudaFree(m_data)); - } - } - -#pragma endregion DeviceBoundNumber class - -#pragma region Helper functions - template - cublasHandle_t _initCUBLAS(int devId) - { - PrepareDevice((DEVICEID_TYPE)devId); - cublasHandle_t cuHandle; - CUBLAS_CALL(cublasCreate(&cuHandle)); - return cuHandle; - } - - // GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information - // TODO: should be replaced by BestGpu class instead, it's much better - template - DEVICEID_TYPE GPUMatrix::GetBestGPUDeviceId() //returns -1 if no GPUs can be used - { - // currently there is little point in giving out different device IDs each time ask for a matrix, - // we really want them all on the same device eventually - static int chosenDeviceId = AUTOPLACEMATRIX; - if (chosenDeviceId != AUTOPLACEMATRIX) - return chosenDeviceId; - - __try - { - // stash previous device state - // if there was one on entry: - int nPrevDev = -1; - cudaError_t ePrevDev = cudaGetDevice(&nPrevDev); - - int deviceCount = -1; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - if (error_id != cudaSuccess || deviceCount==0) - { - return -1; - } - - int setDev = -1; - int curDev=0; - long curPower = 0; - for (DEVICEID_TYPE dev = 0; dev < deviceCount; ++dev) - { - CUDA_CALL(cudaSetDevice(dev)); - setDev = dev; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - long power = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; - //long power = _GetFreeMemoryOnCUDADevice(dev); - if (power>curPower) - { - curPower=power; - curDev = dev; - } - } - - if(nPrevDev >= 0 && ePrevDev == cudaSuccess && - setDev >= 0 && setDev != nPrevDev) { - // restore current context to the one we entered with - // if there was one the caller might want unchanged. - cudaSetDevice(nPrevDev); - } - chosenDeviceId = curDev; - return curDev; - } - __except (1) - { - return -1; // CPU - } - } - - // PrepareDevice - Setup the correct cuda context for an operation - // deviceId - the device on which the operation will take place - // defaults to -1, which means use matrices current device - template - DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const - { - // if default value use current compute device - DEVICEID_TYPE newId = deviceId >= 0 ? 
deviceId : m_computeDevice; - - Microsoft::MSR::CNTK::PrepareDevice(newId); - return newId; - } - - template - ElemType* GPUMatrix::CopyToArray() const - { - size_t numElements = GetNumElements(); - if (numElements != 0) - { - PrepareDevice(); - ElemType* pArray = new ElemType[numElements]; - CUDA_CALL(cudaMemcpy(pArray,m_pArray,sizeof(ElemType)*m_numRows*m_numCols,cudaMemcpyDeviceToHost)); - return pArray; - } - else - { - return NULL; - } - } - - //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done - //return number of elements copied - template - size_t GPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const - { - size_t numElements = GetNumElements(); - - if (numElements > currentArraySize) - { - delete arrayCopyTo; - arrayCopyTo = new ElemType[numElements]; - currentArraySize = numElements; - } - - if (numElements != 0) - { - PrepareDevice(); - CUDA_CALL(cudaMemcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements, cudaMemcpyDeviceToHost)); - } - - return numElements; - } - - template - void GPUMatrix::ChangeDeviceTo(DEVICEID_TYPE to_id) - { - if (!OwnBuffer()) - throw std::logic_error("Cannot change device on Managed external matrix"); - if (to_id == CPUDEVICE) - throw std::logic_error("to_id must be valid GPU"); - if (m_computeDevice==to_id) - return; - - PrepareDevice((DEVICEID_TYPE)to_id); - ElemType* d_dst=NULL; - CUDA_CALL(cudaMalloc((void**)&d_dst,sizeof(ElemType)*m_numRows*m_numCols)); - - m_elemSizeAllocated = m_numRows*m_numCols; - - // check to make sure we have something to copy (on init we often have zero sized allocations) - if (m_elemSizeAllocated > 0) - { - // first try peer access - int canAccessPeer = false; - CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice)); - if (canAccessPeer) - { - CUDA_CALL(cudaDeviceEnablePeerAccess(m_computeDevice, 0)); - CUDA_CALL(cudaMemcpyPeer(d_dst,to_id,m_pArray,m_computeDevice,sizeof(ElemType)*m_numRows*m_numCols)); - } - else - { - // peer access didn't work, just copy normal - // make this more efficient by keeping some buffers available for each copy - ElemType* h_dst=NULL; - PrepareDevice(); - CUDA_CALL(cudaMallocHost((void**)&h_dst,sizeof(ElemType)*m_numRows*m_numCols)); - CUDA_CALL(cudaMemcpy(h_dst,m_pArray,sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyDeviceToHost)); - PrepareDevice((DEVICEID_TYPE)to_id); - CUDA_CALL(cudaMemcpy(d_dst,h_dst,sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyHostToDevice)); - CUDA_CALL(cudaFreeHost(h_dst)); - } - } - PrepareDevice(); - CUDA_CALL(cudaFree(m_pArray)); - m_pArray=d_dst; - - PrepareDevice((DEVICEID_TYPE)to_id); - m_computeDevice=to_id; - } - - template - void GPUMatrix::performInplaceFunction(int kind) - { - PrepareDevice(); - LONG64 N= (LONG64) GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - switch (kind) - { - case 0: - _inplaceSigmoidOnCuda<<>>(m_pArray, N); - break; - case 1: - _inplaceTanhOnCuda<<>>(m_pArray, N); - break; - case 2: - _inplaceSqrtOnCuda<<>>(m_pArray, N); - break; - case 3: - _inplaceExpOnCuda<<>>(m_pArray,N); - break; - case 4: - _inplaceLogOnCuda<<>>(m_pArray,N); - break; - case 5: - _inplaceAbsOnCuda<<>>(m_pArray,N); - break; - case 6: - _inplaceLinRectDerivative<<>>(m_pArray,N); - break; - case 7: - _inplaceCosineOnCuda<<>>(m_pArray,N); - break; - case 8: - _inplaceNegativeSineOnCuda<<>>(m_pArray,N); - break; - } - if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - -#pragma endregion Helper functions - -#pragma region Constructors and Destructor - - //should only be used by constructors. - template - void GPUMatrix::ZeroInit(int deviceId) - { - m_computeDevice = deviceId; - m_pArray = nullptr; - m_numRows = 0; - m_numCols = 0; - m_elemSizeAllocated = 0; - m_matrixName=NULL; - m_format = matrixFormatDense; - m_externalBuffer = false; - } - - template - GPUMatrix::GPUMatrix(int deviceId) - { - if (deviceId == MANAGEDEXTERN) - throw std::logic_error("Basic constructor cannot be used with Managed Extern types"); - - ZeroInit(deviceId); - }; - - //matrixName is used to verify that correct matrix is read. - template - GPUMatrix::GPUMatrix(FILE* f, const char * matrixName, int deviceId) - { - if (deviceId == MANAGEDEXTERN) - throw std::logic_error("File constructor cannot be used with Managed Extern types"); - - ReadFromFile(f, matrixName); - } - - template - GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols,int deviceId) - { - if (deviceId == MANAGEDEXTERN) - throw std::logic_error("constructor cannot be used with Managed Extern types"); - ZeroInit(deviceId); - m_numRows = numRows; - m_numCols = numCols; - m_elemSizeAllocated = GetNumElements(); - - if (m_elemSizeAllocated != 0) - { - PrepareDevice(); - CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); - CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); - } - }; - - template - GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags, int deviceId) - { - ZeroInit(deviceId); - SetValue(numRows, numCols, pArray, matrixFlags, deviceId); - }; - - template - GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) - { - ZeroInit(deepCopyFrom.m_computeDevice); - SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); - } - - template - GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) - { - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_computeDevice = moveFrom.m_computeDevice; - m_pArray = moveFrom.m_pArray; //shallow copy the pointer - m_matrixName=moveFrom.m_matrixName; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; - m_format = moveFrom.m_format; - m_externalBuffer = moveFrom.m_externalBuffer; - - //release the pointer from the source object so that the destructor won't release it twice - moveFrom.ZeroInit(0); - } - - //assignment operator, deep copy - template - GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) - { - if (this != &deepCopyFrom) - { - SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); - } - return *this; - } - - //move assignment operator, shallow copy - template - GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) - { - if (this != &moveFrom) - { - if (OwnBuffer() && m_pArray!=NULL) - { - CUDA_CALL(cudaFree(m_pArray)); - } - - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; - m_pArray = moveFrom.m_pArray; - m_computeDevice = moveFrom.m_computeDevice; - m_format = moveFrom.m_format; - m_externalBuffer = moveFrom.m_externalBuffer; - - //release the pointer from the source object so that the destructor won't release it twice - moveFrom.ZeroInit(0); - } - return *this; - } - - template - GPUMatrix::~GPUMatrix(void) - { - Clear(); - } - - template - void GPUMatrix::Clear() - { - if (OwnBuffer() && 
m_pArray!=NULL) - { - if (m_computeDevice>=0) - { - PrepareDevice(); - cudaFree(m_pArray); - m_pArray = NULL; - m_elemSizeAllocated = 0; - } - } - BaseMatrix::Clear(); - - ZeroInit(m_computeDevice); - } -#pragma endregion Constructors and Destructor - - template - int GPUMatrix::GetComputeDeviceId() const - { - // for externally managed memory the CUDA context will have the current device - if (m_computeDevice == MANAGEDEXTERN) - { - int devId; - assert(m_externalBuffer); - CUDA_CALL(cudaGetDevice(&devId)); - return devId; - } - return m_computeDevice; - } - -#pragma region Basic Operators - template - GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const - { - if (numCols == 0) - throw std::logic_error("The slice cannot have 0 columns."); - - if (startColumn + numCols > m_numCols) - throw std::logic_error("The slice is out of range of the source matrix."); - - GPUMatrix slice(m_numRows, numCols, m_pArray + startColumn * m_numRows, matrixFlagDontOwnBuffer, m_computeDevice); - - return slice; - } - - template - GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) - { - if (numCols == 0) - throw std::logic_error("The slice cannot have 0 columns."); - - if (startColumn + numCols > m_numCols) - throw std::logic_error("The slice is out of range of the source matrix."); - - Clear(); - - m_computeDevice=fromMatrix.m_computeDevice; - m_externalBuffer=true; - m_numRows = fromMatrix.m_numRows; - m_pArray=fromMatrix.m_pArray + startColumn * m_numRows; - - m_elemSizeAllocated = GetNumElements(); - m_matrixName=NULL; - m_format = fromMatrix.m_format; - - return *this; - } - - - //for each column of a, we assign numRows starting from startIndex to this - template - GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) - { - if (a.IsEmpty()) - throw std::logic_error("AssignRowSliceValuesOf: input matrix a is empty."); - - if (startIndex + numRows > a.GetNumRows()) - throw std::logic_error("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); - - Resize(numRows, a.GetNumCols()); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)numRows, (long)a.GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //for the row slice of this starting from startIndex we add a to it. 
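In the column-major storage used throughout these matrices, element (i, j) lives at flat offset j*numRows + i, so the slice routines that follow only shift the row coordinate by startIndex within each column. A minimal host-side sketch of the accumulation AddToRowSliceValuesOf performs (the free function and its parameter names are illustrative, not from the sources):

    #include <cstddef>

    template <class ElemType>
    void AddToRowSlice(ElemType* dst, size_t dstRows,        // full destination matrix
                       const ElemType* src, size_t srcRows,  // slice-sized source, srcRows == numRows
                       size_t numCols, size_t startIndex)
    {
        for (size_t j = 0; j < numCols; ++j)
            for (size_t i = 0; i < srcRows; ++i)
                // column-major: (row, col) -> col * numRows + row
                dst[j * dstRows + (startIndex + i)] += src[j * srcRows + i];
    }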
- template - GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) - { - if (a.IsEmpty()) - throw std::logic_error("AddToRowSliceValuesOf: input matrix a is empty."); - - if (a.GetNumRows() != numRows) - throw std::logic_error("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); - - if (startIndex + numRows > GetNumRows()) - throw std::logic_error("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); - - if (a.GetNumCols() != GetNumCols()) - throw std::logic_error("AddToRowSliceValuesOf: columns does not match."); - - LONG64 N=(LONG64)a.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addToRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //for each column of this, we add row slice of a starting from startIndex - template - GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) - { - if (a.IsEmpty()) - throw std::logic_error("AddWithRowSliceValuesOf: input matrix a is empty."); - - if (GetNumRows() != numRows) - throw std::logic_error("AddWithRowSliceValuesOf: GetNumRows() != numRows."); - - if (startIndex + numRows > a.GetNumRows()) - throw std::logic_error("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); - - if (a.GetNumCols() != GetNumCols()) - throw std::logic_error("AddWithRowSliceValuesOf: columns does not match."); - - LONG64 N = (LONG64)GetNumElements(); - int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addWithRowSliceValuesOf << > >(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) - { - if (this == &a) - throw std::logic_error("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); - - if (a.IsEmpty()) - throw std::logic_error("AssignRepeatOf: Matrix a is empty."); - - Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); - - LONG64 N = (LONG64)GetNumElements(); - long n = (long)a.GetNumCols(), m = (long)a.GetNumRows(); - int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignRepeatOf << > >(m_pArray, a.m_pArray, N, m, n, (long)GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix GPUMatrix::Transpose() const - { - if (IsEmpty()) - throw std::logic_error("Transpose: Matrix is empty."); - - GPUMatrix c(GetComputeDeviceId()); - c.AssignTransposeOf(*this); - return c; - } - - // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU - // computeDevice - The compute device for which the cublas handle is desired - // returns: cublas handle - // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends - template - cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice/*=-1*/) - { - // if the compute device is not passed, get the current device from CUDA - if (computeDevice < 0) - cudaGetDevice(&computeDevice); - - if (computeDevice < 0 || computeDevice >= MaxGpus) - throw std::logic_error("GetCublasHandle: Maximum GPU exceeded"); - cublasHandle_t cuHandle = s_cuHandle[computeDevice]; - if (cuHandle == NULL) - { - s_cuHandle[computeDevice] = cuHandle = _initCUBLAS(computeDevice); - } - CUBLAS_CALL(cublasSetStream(cuHandle, t_stream)); - - return cuHandle; - } - - template - GPUMatrix& GPUMatrix::AssignTransposeOf (const GPUMatrix& a) - { - if (this == &a) - throw std::logic_error("AssignTransposeOf: a is the same as [this]. 
Does not support inplace transpose."); - - if (a.IsEmpty()) - throw std::logic_error("AssignTransposeOf: Matrix a is empty."); - - if (GetNumRows()!=a.GetNumCols() || GetNumCols()!=a.GetNumRows()) - Resize(a.GetNumCols(), a.GetNumRows()); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - cublasOperation_t transA = CUBLAS_OP_T; - cublasOperation_t transB = CUBLAS_OP_T; - int m = (int)a.m_numCols; - int n = (int)a.m_numRows; - ElemType alpha=1; - ElemType beta=0; - cublasStatus_t st; - if (sizeof(ElemType)==sizeof(float)) - { - st = cublasSgeam(cuHandle,transA,transB,m,n,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(&beta),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(m_pArray),(int)m_numRows); - } - else if (sizeof(ElemType)==sizeof(double)) - { - st = cublasDgeam(cuHandle,transA,transB,m,n,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(&beta),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(m_pArray),(int)m_numRows); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - if (st!=CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("AssignTransposeOf failed"); - } - m_numRows=a.m_numCols; - m_numCols=a.m_numRows; - SetMatrixName(a.GetMatrixName()); - return *this; - } - - template - void GPUMatrix::SetValue(const ElemType v) - { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setValue<<>>(m_pArray,v,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetValue(const ElemType* d_v) //d_v is pointer to the the value in GPU memory - { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setValue<<>>(m_pArray,d_v,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetColumn(const ElemType* colPointer, size_t colInd) - { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); - if (colPointer==NULL) - return; - CUDA_CALL(cudaMemcpy(m_pArray+LocateColumn(colInd),colPointer,sizeof(ElemType)*m_numRows,cudaMemcpyHostToDevice)); - } - - template - void GPUMatrix::SetValue(const GPUMatrix& deepCopyFrom) - { - if (this == &deepCopyFrom) - return; - - Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols()); - m_format = deepCopyFrom.m_format; // copy the format over just to be sure - size_t cpSize = deepCopyFrom.GetNumRows() * deepCopyFrom.GetNumCols(); - if (cpSize != 0) - CUDA_CALL(cudaMemcpy(m_pArray,deepCopyFrom.m_pArray,cpSize*sizeof(ElemType),cudaMemcpyDeviceToDevice)); - } - - template - void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, size_t matrixFlags, int deviceId) - { - // handle externally managed case - if (matrixFlags&matrixFlagDontOwnBuffer) - { - // free the existing array if it used to be an owned array - if (OwnBuffer() && m_pArray!=NULL) - { - 
PrepareDevice(); - CUDA_CALL(cudaFree(m_pArray)); - } - m_numRows = numRows; - m_numCols = numCols; - m_pArray = pArray; - m_elemSizeAllocated = GetNumElements(); - m_matrixName = NULL; - m_format = matrixFormatDense; - m_externalBuffer = true; - m_computeDevice = deviceId; - } - else - { - // if didn't previously own the buffer, wipe it clean - if (!OwnBuffer()) - { - ZeroInit(deviceId); - } - - // if the devices are different move it now - if (m_computeDevice != deviceId && deviceId >= 0) - { - Clear(); - ZeroInit(deviceId); - } - - // now resize/allocate as necessary - Resize(numRows, numCols); - m_externalBuffer = false; - - // copy over the content to the buffer - PrepareDevice(); - if (pArray!=NULL) - { - if (!(matrixFlags&matrixFormatRowMajor)) - { - CUDA_CALL(cudaMemcpy(m_pArray, pArray, sizeof(ElemType)*GetNumElements(), - (matrixFlags&matrixFlagSetValueOnDevice)?cudaMemcpyDeviceToDevice:cudaMemcpyHostToDevice)); - } - else - { - throw std::runtime_error("Row major isn't implemented"); - } - } - } - m_format = matrixFormatDense; - } - - - template - void GPUMatrix::SetDiagonalValue(const ElemType v) - { - unsigned long N=(unsigned long)GetNumRows(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setDiagonalValue<<>>(m_pArray,v,N,(unsigned long)GetNumRows()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetDiagonalValue(GPUMatrix& vector) - { - if (IsEmpty() || vector.IsEmpty()) - throw std::logic_error("SetDiagonalValue: Matrix is empty."); - - if (GetNumRows() != GetNumCols()) - throw std::logic_error("SetDiagonalValue: NumRows and NumCols do not agree."); - - if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) - throw std::logic_error("SetDiagonalValue: input vector must be a vector."); - - if (vector.GetNumElements() == 1) //reduce to simple form - SetDiagonalValue(vector.m_pArray[0]); - - else if (vector.GetNumRows() != GetNumRows()) - throw std::logic_error("SetDiagonalValue: input vector's dimension does not agree with [this]."); - else - { - long N=(long)GetNumRows(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setDiagonalValueFromVector<<>>(m_pArray,vector.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - template - void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) - { - PrepareDevice(); - if (s_curandGenerator==NULL) - { - s_curandGenerator = new curandGenerator_t; - /* Create pseudo-random number generator */ - CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); - CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); - } - - cudaEvent_t done = nullptr; - CUDA_CALL(cudaEventCreate(&done)); - if (sizeof(ElemType)==sizeof(float)) - { - CURAND_CALL(curandGenerateUniform(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); - } - else - { - CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); - } - CUDA_CALL(cudaEventRecord(done)); - CUDA_CALL(cudaEventSynchronize(done)); - //CURAND_CALL(curandDestroyGenerator(gen)); - CUDA_CALL(cudaEventDestroy(done)); - - size_t N=GetNumElements(); - size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rescaleToRange<<>>(m_pArray,N,low,high); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) - { - PrepareDevice(); - if (s_curandGenerator==NULL) - { - s_curandGenerator = new curandGenerator_t; - /* Create pseudo-random number generator */ - CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? time(NULL) : seed)); - CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); - } - - if (sizeof(ElemType)==sizeof(float)) - { - CURAND_CALL(curandGenerateNormal(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (float)mean, (float)sigma)); - } - else - { - CURAND_CALL(curandGenerateNormalDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (double)mean, (double)sigma)); - } - //CURAND_CALL(curandDestroyGenerator(gen)); - } - - //maskRate: percentage of values masked out (similar to dropout rate) - //scaleValue: which scale value to set to the left ones (unmasked items). - template - void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) - { - PrepareDevice(); - if (s_curandGenerator==NULL) - { - s_curandGenerator = new curandGenerator_t; - /* Create pseudo-random number generator */ - CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); - CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); - } - - cudaEvent_t done = nullptr; - CUDA_CALL(cudaEventCreate(&done)); - if (sizeof(ElemType)==sizeof(float)) - { - CURAND_CALL(curandGenerateUniform((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); - } - else - { - CURAND_CALL(curandGenerateUniformDouble((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); - } - CUDA_CALL(cudaEventRecord(done)); - CUDA_CALL(cudaEventSynchronize(done)); - CUDA_CALL(cudaEventDestroy(done)); - //CURAND_CALL(curandDestroyGenerator(gen)); - - size_t N=GetNumElements(); - size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setMaskAndScale<<>>(m_pArray,N,maskRate,scaleValue); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::Adagrad(GPUMatrix& gradients) - { - if (IsEmpty()) - { - Resize(gradients.GetNumRows(), gradients.GetNumCols()); - SetValue(0.0); - } - - assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols()); - - int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; - _adagrad<<>>(m_pArray, gradients.m_pArray, GetNumElements()); - } - - template - void GPUMatrix::RmsProp(GPUMatrix& gradients, - ElemType RMS_GAMMA, - ElemType RMS_WGT_INC, - ElemType RMS_WGT_MAX, - ElemType RMS_WGT_DEC, - ElemType RMS_WGT_MIN - ) - { - const ElemType floor = 1e-6f; - static ElemType *upd_gpu = (ElemType*)0; - - size_t n = gradients.GetNumElements(); - int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; - - if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3) - { - Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3); - SetValue(0.0); - - ElemType *avars=m_pArray; // accumulated variances for RMS scaling - ElemType *signs=m_pArray+n; // sign of previous gradient - ElemType *steps=m_pArray+2*n; // current step size - - _rmsprop_init<<>>(avars,signs,steps,gradients.m_pArray,n); - - } - - ElemType *avars=m_pArray; // accumulated variances for RMS scaling - ElemType *signs=m_pArray+n; // sign of previous gradient - ElemType *steps=m_pArray+2*n; // current step size - - assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3); - - if( !upd_gpu ) - { - ElemType upd[] = { - 2,2,0, - 2,2,0, - 1,1,1, - 2,2,0, - 1,2,1, - 0,2,2, - 1,1,1, - 0,2,2, - 0,2,2, - }; - - CUDA_CALL(cudaMalloc((void**)&upd_gpu,sizeof(ElemType)*27)); - CUDA_CALL(cudaMemcpy(upd_gpu,upd,sizeof(ElemType)*27,cudaMemcpyHostToDevice)); - } - - _rmsprop<<>>(avars,signs,steps,gradients.m_pArray,n, - RMS_GAMMA,RMS_WGT_INC,RMS_WGT_MAX,RMS_WGT_DEC,RMS_WGT_MIN, - floor,upd_gpu); - } - - template - void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) - { - assert (numRows*numCols == GetNumElements()); - if (numRows*numCols != GetNumElements()) - throw std::invalid_argument("Reshape: total number of elements does not match."); - - m_numRows = numRows; - m_numCols = numCols; - } - - template - void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) - { - if (m_numRows==numRows && m_numCols==numCols) - return; - - m_numRows = numRows; - m_numCols = numCols; - - size_t numElements = GetNumElements(); - if (numElements > 
m_elemSizeAllocated || (!growOnly && numElements != m_elemSizeAllocated)) - { - if (IsEmpty()) - { - m_elemSizeAllocated = 0; - m_pArray = NULL; - } - else - { - if (!OwnBuffer()) - throw std::invalid_argument("Can't resize a externally managed matrix"); - PrepareDevice(); - if (m_pArray!=NULL) - CUDA_CALL(cudaFree(m_pArray)); //delete and reallocate - m_elemSizeAllocated = numElements; - CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); - CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); - } - } - } - - template - size_t GPUMatrix::LocateElement (const size_t row, const size_t col) const - { - assert (row < m_numRows && col < m_numCols); - return col * m_numRows + row; // matrix in column-wise storage - } - - template - size_t GPUMatrix::LocateColumn (const size_t col) const - { - assert (col < m_numCols); - return col * m_numRows; // matrix in column-wise storage - } - - template - ElemType GPUMatrix::Get00Element() const - { - ElemType res=0; - CUDA_CALL(cudaMemcpy(&res,m_pArray,sizeof(ElemType),cudaMemcpyDeviceToHost)); - return res; - } -#pragma endregion Basic Operators - -#pragma region Member BLAS Functions - template - GPUMatrix& GPUMatrix::operator+= (ElemType alpha) - { - if (IsEmpty()) - throw std::logic_error("operator+=: Matrix is empty."); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addValue<<>>(m_pArray,alpha,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix GPUMatrix::operator+ (ElemType alpha) const - { - if (IsEmpty()) - throw std::logic_error("operator+: Matrix is empty."); - - const GPUMatrix& us=*this; - GPUMatrix c(us); - c+=alpha; - return c; - } - - template - GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) - { - SetValue(a); - (*this)+=alpha; - return (*this); - } - - - template - GPUMatrix& GPUMatrix::operator+= (const GPUMatrix& a) - { - //if (a.GetNumElements()==1) - //{ - // //*this += a.Get00Element(); - // LONG64 N=(LONG64)GetNumElements(); - // int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - // cudaEvent_t done = nullptr; - // if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - // _addValue<<>>(m_pArray,a.m_pArray,N); - // if (do_sync) CUDA_CALL(cudaEventRecord(done)); - // if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - // if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - //} - //else - //{ - ScaleAndAdd(1, a, *this); - //} - return *this; - } - - template - GPUMatrix GPUMatrix::operator+ (const GPUMatrix& a) const - { - if (GetNumElements()==1) - { - GPUMatrix c(a); - c+=Get00Element(); - return c; - } - else if (a.GetNumElements()==1) - { - GPUMatrix c(*this); - c+=a.Get00Element(); - return c; - } - else - { - GPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but make resue of the code - c += a; - return c; - } - } - - template - GPUMatrix& GPUMatrix::AssignSumOf(const GPUMatrix& a, const GPUMatrix& b) - { - SetValue(a); - (*this)+=b; - return (*this); - } - - template - GPUMatrix& GPUMatrix::operator-= (ElemType alpha) - { - if (IsEmpty()) - throw std::logic_error("operato-=: Matrix is empty."); - return operator+=(-1*alpha); - } - - template - GPUMatrix GPUMatrix::operator- (ElemType alpha) const - { - if (IsEmpty()) - throw std::logic_error("operator-: Matrix is empty."); - return operator+(-1*alpha); - } - - template - GPUMatrix& GPUMatrix::AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a) - { - Resize(a.m_numRows,a.m_numCols); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignDifferenceOf1<<>>(m_pArray,alpha,a.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - /*Resize(a.m_numRows,a.m_numCols); - SetValue(alpha); - (*this)-=a; - return *this;*/ - } - - template - GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha) - { - Resize(a.m_numRows,a.m_numCols); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignDifferenceOf2<<>>(m_pArray,alpha,a.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - /*SetValue(a); - (*this)-=alpha; - return *this;*/ - } - - template - GPUMatrix& GPUMatrix::operator-= (const GPUMatrix& a) - { - //if (a.GetNumElements() == 1) - // AssignDifferenceOf(*this, a.Get00Element()); - //else if (GetNumElements() == 1) - // AssignDifferenceOf(Get00Element(), a); - //else - ScaleAndAdd(-1, a, *this); - - return *this; - } - - template - GPUMatrix GPUMatrix::operator- (const GPUMatrix& a) const - { - GPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but make resue of the code - c -= a; - return c; - } - - template - GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b) - { - if (this != &a) - { - Resize(a.GetNumRows(), a.GetNumCols()); - SetValue(a); - } - (*this) -= b; - return *this; - } - - template - GPUMatrix& GPUMatrix::operator*= (ElemType alpha) - { - Scale(alpha, *this); - return *this; - } - - template - GPUMatrix GPUMatrix::operator* (ElemType alpha) const - { - GPUMatrix c(GetNumRows(), GetNumCols()); - Scale(alpha, *this, c); - return c; - } - - template - GPUMatrix& GPUMatrix::AssignProductOf(const ElemType alpha, const GPUMatrix& a) - { - Scale(alpha, a, *this); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignProductOf (const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB) - { - if (a.GetNumElements() == 1) - { - if (transposeB) - AssignTransposeOf(b); - (*this) *= a.Get00Element(); - } - else if (b.GetNumElements() == 1) - { - if (transposeA) - AssignTransposeOf(a); - (*this) *= b.Get00Element(); - } - else - Multiply(a, transposeA, b, transposeB, *this); - return *this; - } - - template - GPUMatrix GPUMatrix::operator* (const GPUMatrix& a) const - { - const GPUMatrix& us = *this; - if (GetNumElements() == 1) - { - GPUMatrix c(GetComputeDeviceId()); - c.AssignProductOf(Get00Element(), a); - return c; - } - else if (a.GetNumElements() == 1) - { - GPUMatrix c(GetComputeDeviceId()); - c.AssignProductOf(a.Get00Element(), us); - return c; - } - else - { - GPUMatrix c(GetNumRows(),a.GetNumCols(),GetComputeDeviceId()); - Multiply(*this, a, c); - return c; - } - } - - template - GPUMatrix& GPUMatrix::operator/= (ElemType alpha) - { - (*this) *= 1/alpha; - return (*this); - } - - template - GPUMatrix GPUMatrix::operator/ (ElemType alpha) const - { - return ((*this) * (1/alpha)); - } - - //element-wise power - template - GPUMatrix& GPUMatrix::operator^= (ElemType alpha) - { - GPUMatrix& us = *this; - ElementWisePower(alpha, us, us); - return us; - } - - template - GPUMatrix GPUMatrix::operator^ (ElemType alpha) const - { - GPUMatrix c(GetNumRows(), GetNumCols()); - ElementWisePower(alpha, *this, c); - return c; - } - - template - GPUMatrix& GPUMatrix::AssignElementPowerOf(const GPUMatrix& a, const ElemType power) - { - ElementWisePower(power, a, *this); - return *this; - } - - - template - GPUMatrix& GPUMatrix::AddElementProductOf (const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AddElementProductOf: Matrix is empty."); - - assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - throw std::invalid_argument("The input matrix dimensions do not match."); - - if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols())) - throw std::invalid_argument("The input matrix dimensions do not match [this]."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addElementProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::ColumnElementMultiplyWith(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("ColumnElementMultiplyWith: 
Matrix is empty."); - - if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) - throw std::invalid_argument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); - - long N=(long)a.GetNumRows(); - long M=(long)GetNumCols(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _columnElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("RowElementMultiplyWith: Matrix is empty."); - - if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) - throw std::invalid_argument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); - - long N = (long)GetNumRows(); - long M = (long)a.GetNumCols(); - int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rowElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::RowElementDivideBy(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("RowElementDivideBy: Matrix is empty."); - - if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) - throw std::invalid_argument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); - - long N = (long)GetNumRows(); - long M = (long)a.GetNumCols(); - int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rowElementDivideBy << > >(m_pArray, a.m_pArray, N, M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::ColumnElementDivideBy(const GPUMatrix& a) - { - if (a.IsEmpty() || IsEmpty()) - throw std::logic_error("ColumnElementDivideBy: Matrix is empty."); - - if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) - throw std::invalid_argument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows."); - - long N = (long)a.GetNumRows(); - long M = (long)GetNumCols(); - int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _ColumnElementDivideBy<<>>(m_pArray,a.m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::ElementInverse () - { - if (IsEmpty()) - throw std::logic_error("ElementInverse: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _elemInverse<<>>(m_pArray,N); - if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignElementInverseOf (const GPUMatrix& a) - { - SetValue(a); - return ElementInverse(); - } - - template - GPUMatrix& GPUMatrix::InplaceSigmoid() - { - performInplaceFunction(0); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignSigmoidOf (const GPUMatrix& a) - { - Resize(a.GetNumRows(),a.GetNumCols()); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignSigmoidOf<<>>(a.m_pArray,m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - /*SetValue(a); - InplaceSigmoid();*/ - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceSigmoidDerivative() - { - AssignSigmoidDerivativeOf(*this); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignSigmoidDerivativeOf (const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignSigmoidDerivativeOf: Matrix a is empty."); - - //auto& us=*this; - if (this != &a) - Resize(a.GetNumRows(), a.GetNumCols()); - - PrepareDevice(); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - - _assignSigmoidDerivative<<>>(a.m_pArray, m_pArray, N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - - template - GPUMatrix& GPUMatrix::InplaceTanh() - { - performInplaceFunction(1); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignTanhOf (const GPUMatrix& a) - { - SetValue(a); - InplaceTanh(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceLogSoftmax (const bool isColWise) - { - if (IsEmpty()) - throw std::logic_error("InplaceLogSoftmax: Matrix is empty."); - - PrepareDevice(); - if (isColWise) - { - long N=(long)GetNumCols(); //one kernel per column - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _logSoftMaxColWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else - { - long N=(long)GetNumRows(); //one kernel per column - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _logSoftMaxRowWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignLogSoftmaxOf (const GPUMatrix& a, const bool isColWise) - { - Resize(a.GetNumRows(),a.GetNumCols()); - if (isColWise) - { - PrepareDevice(); - long N = (long)GetNumCols(); - long M = (long)GetNumRows(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignColumnwiseLogSoftmaxOf<<>>(a.m_pArray,m_pArray,N,M); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) 
CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else - { - NOT_IMPLEMENTED; - } - - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceSqrt() - { - performInplaceFunction(2); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignSqrtOf (const GPUMatrix& a) - { - SetValue(a); - InplaceSqrt(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceExp() - { - performInplaceFunction(3); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignExpOf (const GPUMatrix& a) - { - SetValue(a); - InplaceExp(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceLog() - { - performInplaceFunction(4); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignLogOf (const GPUMatrix& a) - { - SetValue(a); - InplaceLog(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceAbs() - { - performInplaceFunction(5); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignAbsOf (const GPUMatrix& a) - { - SetValue(a); - InplaceAbs(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceLinearRectifierDerivative() - { - performInplaceFunction(6); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignLinearRectifierDerivativeOf (const GPUMatrix& a) - { - SetValue(a); - InplaceLinearRectifierDerivative(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceCosine() - { - performInplaceFunction(7); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignCosineOf (const GPUMatrix& a) - { - SetValue(a); - InplaceCosine(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceNegativeSine() - { - performInplaceFunction(8); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignNegativeSineOf (const GPUMatrix& a) - { - SetValue(a); - InplaceNegativeSine(); - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceTruncateBottom (const ElemType threshold) - { - if (IsEmpty()) - throw std::logic_error("InplaceTruncateBottom: Matrix is empty."); - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _inplaceTruncateBottom<<>>(m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignTruncateBottomOf (const GPUMatrix& a, const ElemType threshold) - { - if (a.IsEmpty()) - throw std::logic_error("AssignTruncateBottomOf: Matrix a is empty."); - - if (this!=&a) - { - Resize(a.GetNumRows(), a.GetNumCols()); - } - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignTruncateBottom<<>>(m_pArray,a.m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::InplaceTruncateTop (const ElemType threshold) - { - if (IsEmpty()) - throw std::logic_error("InplaceTruncateTop: Matrix is empty."); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _inplaceTruncateTop<<>>(m_pArray,threshold,N); 
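The create/record/synchronize/destroy sequence just after this launch recurs around nearly every kernel in the file; when do_sync is set it makes each launch effectively blocking, so a failure surfaces at the offending kernel rather than at some later API call. A hedged RAII sketch of the same idiom (ScopedLaunchSync is a hypothetical helper, not in the sources; CUDA_CALL-style error checking omitted for brevity):

    #include <cuda_runtime.h>

    class ScopedLaunchSync
    {
        cudaEvent_t m_done = nullptr;
        bool m_active;
    public:
        explicit ScopedLaunchSync(bool doSync) : m_active(doSync)
        {
            if (m_active) cudaEventCreate(&m_done);   // mirrors the cudaEventCreate above
        }
        ~ScopedLaunchSync()                           // record, wait, clean up on scope exit
        {
            if (m_active)
            {
                cudaEventRecord(m_done);
                cudaEventSynchronize(m_done);
                cudaEventDestroy(m_done);
            }
        }
    };

A launch wrapped as { ScopedLaunchSync sync(do_sync); kernel<<<grid, block>>>(...); } would then record and wait on scope exit, matching the hand-written pattern.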
- if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignTruncateTopOf (const GPUMatrix& a, const ElemType threshold) - { - if (a.IsEmpty()) - throw std::logic_error("AssignTruncateTopOf: Matrix a is empty."); - - if (this!=&a) - { - Resize(a.GetNumRows(), a.GetNumCols()); - } - - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignTruncateTop<<>>(m_pArray,a.m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - template - GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) - { - if (IsEmpty()) - throw std::logic_error("SetToZeroIfAbsLessThan: Matrix is empty."); - LONG64 N=(LONG64)GetNumElements(); - int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _setToZeroIfAbsLessThan<<>>(m_pArray,threshold,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - ElemType GPUMatrix::SumOfAbsElements() const - { - if (IsEmpty()) - throw std::logic_error("SumOfAbsElements: Matrix is empty"); - - cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); - if (sizeof(ElemType)==sizeof(float)) - { - float res=0; - cublasSasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); - return res; - } - else - { - double res=0; - cublasDasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); - return ElemType(res); - } - } - - template - ElemType GPUMatrix::SumOfElements() const - { - if (IsEmpty()) - throw std::logic_error("SumOfElements: Matrix is empty"); - - PrepareDevice(); - ElemType* d_sum = NULL; - ElemType h_sum; - CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements()); - CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_sum)); - return h_sum; - } - - - template - GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignSumOfElements: Matrix a is empty"); - - Resize(1,1); - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - //WARNING: THIS kernel is not the most efficient way! - _reductionSumAndAssign<<<1,1024>>>(m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return (*this); - } - - template - DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const - { - if (IsEmpty()) - throw std::logic_error("Matrix is empty"); - PrepareDevice(); - ElemType* d_sum = NULL; - CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! 
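The warning is accurate: the <<<1,1024>>> launch that follows gives the reduction a single block, so at most 1024 threads cover the whole array with no grid-level parallelism. The usual remedy is a multi-block tree reduction; a sketch, not the library's kernel (assumes blockDim.x is a power of two and at most 1024, and that the host or a second pass sums the per-block partials):

    __global__ void BlockSum(const float* in, float* partial, long long n)
    {
        __shared__ float s[1024];
        long long i = blockIdx.x * (long long)blockDim.x + threadIdx.x;
        s[threadIdx.x] = (i < n) ? in[i] : 0.0f;     // each thread loads one element (or 0 past the end)
        __syncthreads();
        for (unsigned stride = blockDim.x / 2; stride > 0; stride >>= 1)
        {
            if (threadIdx.x < stride)
                s[threadIdx.x] += s[threadIdx.x + stride]; // pairwise tree reduction in shared memory
            __syncthreads();
        }
        if (threadIdx.x == 0)
            partial[blockIdx.x] = s[0];              // one partial sum per block
    }
    // launched roughly as: BlockSum<<<(n + 1023) / 1024, 1024>>>(in, partial, n);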
-        _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements());
-        DeviceBoundNumber<ElemType> result;
-        result.ShallowCopyFrom(d_sum,GetComputeDeviceId());
-        return result;
-    }
-
-    template<class ElemType>
-    ElemType GPUMatrix<ElemType>::Max() const
-    {
-        cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId());
-        ElemType res;
-        if (sizeof(ElemType)==sizeof(float))
-        {
-            int resInd=0;
-            cublasIsamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast<float*>(m_pArray),1,&resInd);
-            resInd--;
-            CUDA_CALL(cudaMemcpy(reinterpret_cast<float*>(&res),reinterpret_cast<float*>(m_pArray+resInd),sizeof(float),cudaMemcpyDeviceToHost));
-            return res;
-        }
-        else
-        {
-            int resInd=0;
-            cublasIdamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast<double*>(m_pArray),1,&resInd);
-            resInd--;
-            CUDA_CALL(cudaMemcpy(reinterpret_cast<double*>(&res),m_pArray+resInd,sizeof(double),cudaMemcpyDeviceToHost));
-            return res;
-        }
-    }
-
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::ElementMultiplyWith (const GPUMatrix<ElemType>& a)
-    {
-        if (IsEmpty() || a.IsEmpty())
-            throw std::logic_error("ElementMultiplyWith: Matrix is empty.");
-
-        GPUMatrix<ElemType>& us=*this;
-        assert (us.GetNumRows() == a.GetNumRows() && us.GetNumCols() == a.GetNumCols());
-        if (us.GetNumRows() != a.GetNumRows() || us.GetNumCols() != a.GetNumCols())
-            throw std::invalid_argument("The matrix dimensions do not match.");
-
-        LONG64 N=(LONG64)GetNumElements();
-        int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock);
-        a.PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _elemMul<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
-    }
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignElementProductOf (const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
-    {
-        if (a.IsEmpty() || b.IsEmpty())
-            throw std::logic_error("AssignElementProductOf: Matrix is empty.");
-
-        assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
-        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
-            throw std::invalid_argument("The input matrix dimensions do not match.");
-
-        Resize(a.GetNumRows(), a.GetNumCols());
-        LONG64 N=(LONG64)GetNumElements();
-        int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock);
-        a.PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
-    }
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::ElementDivideBy(const GPUMatrix<ElemType>& a)
-    {
-        return AssignElementDivisionOf(*this, a);
-    }
-
-    template<class ElemType>
-    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignElementDivisionOf (const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
-    {
-        if (a.IsEmpty() || b.IsEmpty())
-            throw std::logic_error("AssignElementDivisionOf: Matrix is empty.");
-
-        assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
-        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
-            throw std::invalid_argument("The input matrix dimensions do not match.");
-
-        Resize(a.GetNumRows(), a.GetNumCols());
-        LONG64 N=(LONG64)GetNumElements();
-        int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock);
-        a.PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementDivisionOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray,N);
-        if (do_sync)
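// [Editor's note; not part of the original diff] Two details in Max() above are easy to
// trip over: cublasI<t>amax returns a 1-based index (Fortran convention), hence the
// resInd-- before dereferencing, and amax selects by the largest *absolute* value, so
// the returned element can be negative. In isolation (float path, this file's CUDA_CALL):
//
//     int idx1 = 0;
//     cublasIsamax(cuHandle, n, d_x, 1, &idx1);   // idx1 is in [1..n]
//     float v = 0;
//     CUDA_CALL(cudaMemcpy(&v, d_x + (idx1 - 1), sizeof(float), cudaMemcpyDeviceToHost));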
CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const - { - return AreEqual(*this, a, threshold); - } - - template - void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorNorm1: Matrix is empty."); - - const long n = (long)GetNumRows(); - const long m = (long)GetNumCols(); - assert (m>0 && n>0); //converting from size_t to int may cause overflow - - cudaEvent_t done = nullptr; - PrepareDevice(); - c.ChangeDeviceTo(GetComputeDeviceId()); - - int blocksPerGrid=0; - if (isColWise) //col-wise - { - c.Resize(1,m); - blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); - } - else - { - c.Resize(n, 1); - c.ChangeDeviceTo(GetComputeDeviceId()); - blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); - } - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _vectorNorm1<<>>(c.m_pArray, m_pArray,n,m,isColWise); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) - { - a.VectorNorm1(*this, isColWise); - return *this; - } - - template - void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorNorm2: Matrix is empty."); - - const long n = (long)GetNumRows(); - const long m = (long)GetNumCols(); - assert (m>0 && n>0); //converting from size_t to int may cause overflow - - cudaEvent_t done = nullptr; - PrepareDevice(); - c.ChangeDeviceTo(GetComputeDeviceId()); - - int blocksPerGrid=0; - if (isColWise) //col-wise - { - c.Resize(1,m); - blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); - } - else - { - c.Resize(n, 1); - c.ChangeDeviceTo(GetComputeDeviceId()); - blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); - } - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _vectorNorm2<<>>(c.m_pArray, m_pArray,n,m,isColWise); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) - { - a.VectorNorm2(*this, isColWise); - return *this; - } - - template - void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorMax: Matrix is empty."); - - //this implementation is not efficient - GPUMatrix tmp; - GPUMatrix tmp1; - tmp.AssignAbsOf((*this)); - tmp.VectorMax(tmp1,c,isColWise); - } - - template - GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) - { - a.VectorNormInf(*this, isColWise); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) - { - InnerProduct (a, b, *this,isColWise); - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AssignKhatriRaoProductOf: Matrix is empty."); - - long cols = a.GetNumCols(); - assert (cols == b.GetNumCols()); - if (!(cols == b.GetNumCols())) - throw std::invalid_argument("AssignKhatriRaoProductOf: The input matrix dimensions do not match."); - - long rowsA = 
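// [Editor's sketch; not part of the original diff] For reference, the column-wise
// Khatri-Rao product built here pairs up columns: output column j is the Kronecker
// product of a(:,j) with b(:,j), so the result is (rowsA*rowsB) x cols. A CPU sketch
// with an assumed a-major row ordering (the GPU kernel defines the authoritative
// layout); data is column-major as everywhere in this file:
//
//     void khatriRaoRef(const float* a, long rowsA, const float* b, long rowsB,
//                       long cols, float* out /* (rowsA*rowsB) x cols */)
//     {
//         for (long j = 0; j < cols; ++j)
//             for (long ia = 0; ia < rowsA; ++ia)
//                 for (long ib = 0; ib < rowsB; ++ib)
//                     out[(ia*rowsB + ib) + j*(rowsA*rowsB)] =
//                         a[ia + j*rowsA] * b[ib + j*rowsB];
//     }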
(long)a.GetNumRows(); - long rowsB = (long)b.GetNumRows(); - Resize(rowsA * rowsB, cols); - float N=(float)GetNumElements(); - int blocksPerGrid =(int)ceil(N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignKhatriRaoProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //column-wise reshaped product. Used to compute KhatriRaoProduct Gradient - // this = reshape each column of a from (K1xK2,1) to (K1, K2) - // if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames). - // the output is a (K1, frames) matrix - // if each column of a is tranposed, each (K1, K2)^T times each column of b(K1, frames) and output is (K2, frames) - template - GPUMatrix& GPUMatrix::AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AddColumnReshapeProductOf: Matrix is empty."); - - long cols = a.GetNumCols(); - assert (cols == b.GetNumCols()); - if (!(cols == b.GetNumCols())) - throw std::invalid_argument("AddColumnReshapeProductOf: The input matrix dimensions do not match."); - - long rowsA = (long)a.GetNumRows(); - long rowsB = (long)b.GetNumRows(); - if (rowsA % rowsB != 0) - throw std::invalid_argument("AddColumnReshapeProductOf: number of rows in a should be multiples of that in b."); - - long rowsC = rowsA / rowsB; - if (rowsC != GetNumRows() || cols != GetNumCols()) - throw std::invalid_argument("AddColumnReshapeProductOf: This matrix does not have the right size."); - - float N=(float)GetNumElements(); - int blocksPerGrid =(int)ceil(N/threadsPerBlock); - a.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addColumnReshapeProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AddWithScaleOf(ElemType alpha, const GPUMatrix& a) - { - ScaleAndAdd(alpha, a, *this); - return *this; - } - - template - ElemType GPUMatrix::FrobeniusNorm() const - { - if (IsEmpty()) - throw std::logic_error("FrobeniusNorm: Matrix is empty."); - - PrepareDevice(); - ElemType* d_sum = NULL; - ElemType h_sum=0; - CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionSum2<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements(), true); - CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_sum)); - - return (h_sum); - } - - template - GPUMatrix& GPUMatrix::AssignFrobeniusNormOf (const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignFrobeniusNormOf: Matrix a is empty."); - - Resize(1,1); - - PrepareDevice(); - //WARNING: THIS kernel is not the most efficient way! 
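// [Editor's note; not part of the original diff] Since the buffer is dense and
// contiguous, the Frobenius norm computed by these reductions equals what cuBLAS nrm2
// returns (the square root of the sum of squares), which would avoid the single-block
// kernel entirely. Float-path sketch:
//
//     float nrm = 0;
//     CUBLAS_CALL(cublasSnrm2(cuHandle, (int)GetNumElements(),
//                             reinterpret_cast<float*>(m_pArray), 1, &nrm));
//     // nrm now holds ||A||_F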
- _reductionSum2<<<1,1024,0,t_stream>>>(a.m_pArray,m_pArray,(LONG64)a.GetNumElements(), true); - - return *this; - } - - template - ElemType GPUMatrix::MatrixNormInf() const - { - if (IsEmpty()) - throw std::logic_error("MatrixNorm1: Matrix is empty."); - - PrepareDevice(); - ElemType* d_maxAbs = NULL; - ElemType h_maxAbs=0; - CUDA_CALL(cudaMalloc((void**)&d_maxAbs,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionMatrixNormInf<<<1,1024,0,t_stream>>>(m_pArray,d_maxAbs,(LONG64)GetNumElements()); - CUDA_CALL(cudaMemcpy(&h_maxAbs,d_maxAbs,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_maxAbs)); - return h_maxAbs; - } - - template - ElemType GPUMatrix::MatrixNorm1() const - { - if (IsEmpty()) - throw std::logic_error("MatrixNorm1: Matrix is empty."); - return SumOfAbsElements(); - } - - template - ElemType GPUMatrix::MatrixNorm0() const - { - if (IsEmpty()) - throw std::logic_error("MatrixNorm0: Matrix is empty."); - - PrepareDevice(); - ElemType* d_nz = NULL; - ElemType h_nz=0; - CUDA_CALL(cudaMalloc((void**)&d_nz,sizeof(ElemType))); - //WARNING: THIS kernel is not the most efficient way! - _reductionMatrixNorm0<<<1,1024,0,t_stream>>>(m_pArray,d_nz,(LONG64)GetNumElements()); - CUDA_CALL(cudaMemcpy(&h_nz,d_nz,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_nz)); - return h_nz; - } - - template - GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AssignSignOf: Matrix a is empty."); - - if (this != &a) - Resize(a.GetNumRows(), a.GetNumCols()); - - PrepareDevice(); - cudaEvent_t done = nullptr; - int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignSignOf<<>>(m_pArray, a.m_pArray, (long)GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) - { - if (a.IsEmpty()) - throw std::logic_error("AddSignOf: Matrix a is empty."); - - if (this != &a) - Resize(a.GetNumRows(), a.GetNumCols()); - - PrepareDevice(); - cudaEvent_t done = nullptr; - int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addSignOf<<>>(m_pArray, a.m_pArray, (LONG64)GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - - template - void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorMax: Matrix is empty."); - - const GPUMatrix& us=*this; - const long m = (long)GetNumRows(); - const long n = (long)GetNumCols(); - assert (m>0 && n>0); //converting from size_t to int may cause overflow - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - if (isColWise) - { - maxValues.Resize(1, n); - maxIndexes.Resize(1, n); - - int blocksPerGrid = n; //we'll have 1 block processing 1 column - _vectorMaxMinReduce<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,true); - - /*int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise);*/ - } - else - { - maxValues.Resize(m, 1); - maxIndexes.Resize(m, 1); - int 
blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); - _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise); - } - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const - { - if (IsEmpty()) - throw std::logic_error("VectorMax: Matrix is empty."); - - const GPUMatrix& us=*this; - const int m = (int)GetNumRows(); - const int n = (int)GetNumCols(); - - assert (m>0 && n>0); //converting from size_t to int may cause overflow - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - if (isColWise) - { - minValues.Resize(1, n); - minIndexes.Resize(1, n); - - int blocksPerGrid = n; //we'll have 1 block processing 1 column - _vectorMaxMinReduce<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,false); - - /* - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise);*/ - } - else - { - minValues.Resize(m, 1); - minIndexes.Resize(m, 1); - int blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); - _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise); - } - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) - throw std::invalid_argument ("AssignNumOfDiff: a and b must have same dimension."); - - Resize(1,1); //result should be one element - - PrepareDevice(); - cudaEvent_t done = nullptr; - //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - //_assignNumOfDiff<<>>(a.m_pArray, b.m_pArray, m_pArray, a.GetNumElements()); - _assignNumOfDiff<<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, (LONG64)a.GetNumElements()); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - return *this; - } - -#pragma endregion Member BLAS Functions - -#pragma region Other helper functions - template - void GPUMatrix::Print(const char* /*matrixName*/, size_t /*rowStart*/, size_t /*rowEnd*/, size_t /*colStart*/, size_t /*colEnd*/) const - { - NOT_IMPLEMENTED; - } - - template - void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const - { - Print(matrixName, 0, GetNumRows()-1, 0, GetNumCols()-1); - } - - // file I/O - //matrixName is used to verify that correct matrix is read. - template - void GPUMatrix::ReadFromFile(FILE*, const char * /*matrixName*/) - { - NOT_IMPLEMENTED; - } - - //matrixName is used to verify that correct matrix is read. 
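// [Editor's sketch; not part of the original diff] In VectorMax/VectorMin above, the
// column-wise path gives each column its own *block* (_vectorMaxMinReduce) so a
// column's m elements are reduced in parallel, while the row-wise path and the
// commented-out alternative use the naive one-thread-per-column strategy. That naive
// kernel spelled out (hypothetical; column-major data; indexes stored as ElemType to
// match the index matrices above):
//
//     template<class ElemType>
//     __global__ void _naiveColArgmax(const ElemType* us, ElemType* maxIndexes,
//                                     ElemType* maxValues, const long m, const long n)
//     {
//         long j = blockDim.x * blockIdx.x + threadIdx.x;  // one thread per column
//         if (j >= n) return;
//         long best = 0;
//         ElemType bestVal = us[j*m];                      // column j starts at offset j*m
//         for (long i = 1; i < m; ++i)
//             if (us[i + j*m] > bestVal) { bestVal = us[i + j*m]; best = i; }
//         maxIndexes[j] = (ElemType)best;
//         maxValues[j]  = bestVal;
//     }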
- template - void GPUMatrix::WriteToFile(FILE*, const char * /*matrixName*/) - { - NOT_IMPLEMENTED; - } - - //helpfer function used for convolution neural network - template - GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, - const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, - const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, - const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, - const bool zeroPadding) - { - assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); - - size_t packedInputRows = kernelWidth * kernelHeight * inputChannels; - size_t packedInputColsPerSample = outputWidth * outputHeight; - size_t smallBatchSize = inputSubBatch.GetNumCols(); - Resize(packedInputRows, packedInputColsPerSample * smallBatchSize); - if (zeroPadding) - SetValue((ElemType)0); - - PrepareDevice(); - int numThreadPerBlock = threadsPerBlock; -#if 1 - int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock; -#else - dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize); -#endif - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignPackedConvolutionInput<<>>(m_pArray, - inputSubBatch.m_pArray, - smallBatchSize, - inputWidth, inputHeight, inputChannels, - outputWidth, outputHeight, outputChannels, - kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - //helpfer function used for convolution neural network - template - GPUMatrix& GPUMatrix::UnpackConvolutionInput(GPUMatrix& inputSubBatch, - const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, - const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, - const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, - const bool zeroPadding) const - { - assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); - - size_t smallBatchSize = inputSubBatch.GetNumCols(); - - PrepareDevice(); - int numThreadPerBlock = threadsPerBlock; -#if 1 - int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock; -#else - dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize); -#endif - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _unpackConvolutionInput<<>>(m_pArray, - inputSubBatch.m_pArray, - smallBatchSize, - inputWidth, inputHeight, inputChannels, - outputWidth, outputHeight, outputChannels, - kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return inputSubBatch; - } - - template - GPUMatrix& GPUMatrix::AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const 
size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - unsigned int batchSize = inputBatch.GetNumCols(); - Resize(outputSizePerSample, batchSize); - - int numThreadPerBlock = threadsPerBlock; - int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignMaxPoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, - const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - unsigned int batchSize = outputGradientBatch.GetNumCols(); - int numThreadPerBlock = threadsPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - - int blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - _addMaxPoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, inputBatch.m_pArray, outputBatch.m_pArray, batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - unsigned int batchSize = inputBatch.GetNumCols(); - Resize(outputSizePerSample, batchSize); - - int numThreadPerBlock = threadsPerBlock; - int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignAveragePoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) 
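// [Editor's sketch; not part of the original diff] A CPU reference for the max-pooling
// forward pass above, to make the window/stride indexing concrete. The within-column
// layout assumed here, (y*width + x)*channels + c with one sample per column, is an
// illustration only; the GPU kernel is the authority.
//
//     void maxPoolRef(const float* in, float* out, size_t channels,
//                     size_t inW, size_t outW, size_t outH,
//                     size_t winW, size_t winH, size_t strideX, size_t strideY)
//     {
//         for (size_t c = 0; c < channels; ++c)
//             for (size_t oy = 0; oy < outH; ++oy)
//                 for (size_t ox = 0; ox < outW; ++ox)
//                 {
//                     float best = in[(oy*strideY*inW + ox*strideX)*channels + c];
//                     for (size_t wy = 0; wy < winH; ++wy)          // scan the window
//                         for (size_t wx = 0; wx < winW; ++wx)
//                         {
//                             float v = in[((oy*strideY + wy)*inW + (ox*strideX + wx))*channels + c];
//                             if (v > best) best = v;
//                         }
//                     out[(oy*outW + ox)*channels + c] = best;      // per-sample column
//                 }
//     }
//
// AddMaxPoolingGradient then routes each output gradient back to the location that won
// the max inside its window, which is why it needs both inputBatch and outputBatch.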
CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - - template - GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, - const size_t channels, - const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, - const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, - const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) - { - assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); - - size_t batchSize = outputGradientBatch.GetNumCols(); - int numThreadPerBlock = threadsPerBlock; - - PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - - size_t blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; - _addAveragePoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, (long)batchSize, channels, - inputWidth, inputHeight,inputSizePerSample, - outputWidth, outputHeight, outputSizePerSample, - windowWidth, windowHeight, horizontalSubsample, verticalSubsample); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - - return *this; - } - -#pragma endregion Other helper functions - -#pragma region Static BLAS Functions - template - void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, - ElemType beta, GPUMatrix& c) - { - a.PrepareDevice(); - if ((a.GetComputeDeviceId()!=b.GetComputeDeviceId()) || (b.GetComputeDeviceId()!=c.GetComputeDeviceId())) //different GPUs - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - cublasHandle_t cuHandle = GetCublasHandle(b.GetComputeDeviceId()); - cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; - int m = int(transposeA ? a.m_numCols : a.m_numRows); - int n = int(transposeB ? b.m_numRows : b.m_numCols); - int k = int(transposeA ? a.m_numRows : a.m_numCols); - int l = int(transposeB ? 
b.m_numCols : b.m_numRows); - c.Resize(m,n); - - if (!(m>0 && k>0 && l>0 && n>0)) - { - throw std::runtime_error("!(m>0 && k>0 && l>0 && n>0)"); //converting from size_t to int may cause overflow - } - if (k!=l) - { - throw std::runtime_error("matrix dim mismatch in MultiplyAndWeightedAdd"); - } - if (sizeof(ElemType)==sizeof(float)) - { - CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); - } - else if (sizeof(ElemType)==sizeof(double)) - { - CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - c.m_numRows=m; - c.m_numCols=n; - } - } - - template - void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) - { - return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 1, c); - } - - template - void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) - { - return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 0, c); - } - - template - void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); - } - - /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c - /// if a is a column vector, add to all columns of c - /// if a is a row vector, add to all rows of c - /// if a is a scalar, add to all elements of c - /// Scalar - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, GPUMatrix& c) - { - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - if (a.IsEmpty() || c.IsEmpty()) - throw std::logic_error("ScaleAndAdd: one of the input matrices is empty."); - //if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector - if (a.GetNumRows()==c.GetNumRows() && a.GetNumCols()==c.GetNumCols()) // dimensions match - { - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int len = m * n; - const int incx = 1; - const int incy = 1; - - assert (m>0 && n>0 && len>0); //converting from size_t to int may cause overflow - assert ((int)c.GetNumRows() == m && (int)c.GetNumCols() == n); - if ((int)c.GetNumRows() != m || (int)c.GetNumCols() != n) - throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - if (sizeof(ElemType) == sizeof(float)) - { - CUBLAS_CALL(cublasSaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); - } - else if (sizeof(ElemType) == sizeof(double)) - { - CUBLAS_CALL(cublasDaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - } - else if 
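// [Editor's note; not part of the original diff] A reading aid for the gemm call above:
// cuBLAS is column-major, so m/n/k describe op(A) and op(B) *after* the transpose flags
// are applied, while the leading dimensions (lda/ldb/ldc) remain the physical row
// counts of the stored matrices. Float-path sketch of the same call shape:
//
//     // C(m x n) = alpha * op(A)(m x k) * op(B)(k x n) + beta * C
//     CUBLAS_CALL(cublasSgemm(cuHandle,
//         transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
//         transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
//         m, n, k, &alpha,
//         d_A, (int)a.m_numRows,    // lda: rows of A as stored
//         d_B, (int)b.m_numRows,    // ldb: rows of B as stored
//         &beta, d_C, m));          // ldc: rows of the result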
(a.GetNumElements() == 1) - { - LONG64 N=(LONG64)c.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - c.PrepareDevice(); - cudaEvent_t done = nullptr; - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _scaleAndAddScalar<<>>(c.m_pArray, N, alpha, a.m_pArray); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else if (a.GetNumCols() == 1) //col vector, add it to all columns - { - long m = (long)c.GetNumRows(); - long n = (long)c.GetNumCols(); - if (m != (long)a.GetNumRows()) - throw std::invalid_argument("To add column vector, rows should match."); - - cudaEvent_t done = nullptr; - int blocksPerGrid = (int)(ceil(1.0*m*n / threadsPerBlock)); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); -#ifdef VALIDATION - printf(">>>> CUDA compute device is %d\n", a.GetComputeDeviceId()); - printf(">>>> a.m_pArray = %p, c.m_pArray = %p, alpha = %f, m = %ld, n = %ld\n", a.m_pArray,c.m_pArray,alpha,m,n); - for (int i=0; i < 2; i++) - { - ElemType buffer[10] = {-1.234f}; - cudaError_t error = cudaMemcpy(buffer, !i?a.m_pArray:c.m_pArray, sizeof(buffer), cudaMemcpyKind::cudaMemcpyDeviceToHost); - if (error == cudaError::cudaSuccess) - printf("buffer valid\n"); - } -#endif - - _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.m_pArray,c.m_pArray,alpha,m,n); - - - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - else if (a.GetNumRows()==1) //row vector, add it to all rows - { - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - int m = (int)c.GetNumRows(); - int n = (int)c.GetNumCols(); - assert (n == (int)a.GetNumCols()); - if (n != (int)a.GetNumCols()) - throw std::invalid_argument("To add row vector, cols should match."); - - if (sizeof(ElemType) == sizeof(double)) - { - foreach_row(i,c) - { - CUBLAS_CALL(cublasDaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); - } - } - else - { - foreach_row(i,c) - { - CUBLAS_CALL(cublasSaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); - } - } - } - else - throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); - } - } - - /// c += alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) - { - throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AddScaledDifference: Input matrix a is empty."); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - 
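// [Editor's sketch; not part of the original diff] The column-vector branch above
// broadcasts an (m x 1) vector into every column of c with one thread per element; a
// plausible body for that kernel (hypothetical: the shipped
// _matrixVectorColumnWiseAddWithThreadPerElem is the authority):
//
//     template<class ElemType>
//     __global__ void _colBroadcastAdd(const ElemType* a, ElemType* c,
//                                      const ElemType alpha, const long m, const long n)
//     {
//         long idx = blockDim.x * blockIdx.x + threadIdx.x;
//         if (idx < m*n)
//             c[idx] += alpha * a[idx % m];   // idx % m = row index in column-major storage
//     }
//
// The row-vector branch instead reuses axpy once per row: incx=1 walks the vector while
// incy=m hops from column to column, i.e. along row i of the column-major matrix c.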
_addScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - /// c = alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - { - throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); - - if (&c != &a && &c != &b) - c.Resize(a.GetNumRows(), a.GetNumCols()); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - /// c += alpha * (a-b) - /// if a, b, c must have same dim - /// 1X1 matrix - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - assert(alpha.GetNumElements() == 1); - if (!(alpha.GetNumElements() == 1)) - throw std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); - - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && - a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) - { - throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AddScaledDifference: Input matrix a is empty."); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - /// c = alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) - { - assert(alpha.GetNumElements() == 1); - if (!(alpha.GetNumElements() == 1)) - throw 
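// [Editor's sketch; not part of the original diff] The element-wise body behind the
// scaled-difference launches above is a one-liner; a plausible form (hypothetical: the
// shipped _addScaledDifference/_assignScaledDifference kernels are the authority):
//
//     template<class ElemType>
//     __global__ void _addScaledDiffSketch(const ElemType alpha, const ElemType* a,
//                                          const ElemType* b, ElemType* c, const LONG64 N)
//     {
//         LONG64 i = (LONG64)blockDim.x * blockIdx.x + threadIdx.x;
//         if (i < N)
//             c[i] += alpha * (a[i] - b[i]);  // the Assign* variant writes '=' instead of '+='
//     }
//
// The GPUMatrix-alpha overloads differ only in reading alpha from device memory.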
std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); - - if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - a.PrepareDevice(); - - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); - - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - { - throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); - } - - if (a.IsEmpty()) - throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); - - c.Resize(a.GetNumRows(), a.GetNumCols()); - - cudaEvent_t done = nullptr; - LONG64 n=(LONG64)a.GetNumElements(); - int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _assignScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - //c[ci,cj] += a[ai,aj] - template - void GPUMatrix::AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) - { - if (ai >= a.GetNumRows() || aj >=a.GetNumCols() || - ci >= c.GetNumRows() || cj >=c.GetNumCols()) - throw std::invalid_argument("AddElementToElement: index out of range."); - - a.PrepareDevice(); - cudaEvent_t done = nullptr; - int blocksPerGrid=1; //only one element - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addElementToElement<<>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj)); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - - template - void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) - { - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - if (sizeof(ElemType)==sizeof(float)) - { - float alph = (float)alpha; - CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(float*)a.m_pArray,1)); - } - else if (sizeof(ElemType)==sizeof(double)) - { - double alph = alpha; - CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(double*)a.m_pArray,1)); - } - else - { - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - } - - - template - void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) - { - if (alpha.GetNumElements()!=1) - { - throw std::runtime_error("Matrix alpha must be 1x1"); - } - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - if (sizeof(ElemType)==sizeof(float)) - { - CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),(float*)alpha.m_pArray,(float*)a.m_pArray,1)); - } - else if (sizeof(ElemType)==sizeof(double)) - { - CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),(double*)alpha.m_pArray,(double*)a.m_pArray,1)); - } - else - { - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - throw std::runtime_error("Unsupported template argument in GPUMatrix"); - } - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - } - - template //c = alpha * a - void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) - { - if (a.IsEmpty()) - throw std::logic_error("Scale: Input matrix a is empty."); - - c=a; - Scale(alpha,c); - } - - - template - void GPUMatrix::InnerProduct (const GPUMatrix& a, const 
GPUMatrix& b, GPUMatrix& c, const bool isColWise) - { - if (a.GetComputeDeviceId()!=b.GetComputeDeviceId() || b.GetComputeDeviceId()!=c.GetComputeDeviceId()) //different GPUs - throw std::invalid_argument("All matrices must be on the same GPU"); - - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("Scale: one of the input matrices is empty."); - - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int k = (int)b.GetNumRows(); - const int l = (int)b.GetNumCols(); - - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) - throw std::invalid_argument("Matrices a and b should have same dimension."); - - if (isColWise) - c.Resize(1,n); - else - c.Resize(m,1); - - if ((isColWise && m == 1) || !isColWise && n == 1) //in this case it's equivalent to element-wise product - { - c.AssignElementProductOf(a, b); - } - else - { - cudaEvent_t done = nullptr; - c.PrepareDevice(); - - int blocksPerGrid=0; - if (isColWise) //col-wise - { - c.Resize(1,n); - blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); - } - else - { - c.Resize(m, 1); - blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); - } - - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _innerProduct<<>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - template - ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); - - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int k = (int)b.GetNumRows(); - const int l = (int)b.GetNumCols(); - - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) - throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - if (sizeof(ElemType) == sizeof(double)) - { - double tmp=0; - CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); - return ElemType(tmp); - //return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); - } - else - { - float tmp=0; - CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); - return tmp; - //return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); - } - } - - - template - GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); - - Resize(1,1); - - const int m = (int)a.GetNumRows(); - const int n = (int)a.GetNumCols(); - const int k = (int)b.GetNumRows(); - const int l = (int)b.GetNumCols(); - - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) - throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have 
same dimension."); - - cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - if (sizeof(ElemType) == sizeof(double)) - { - CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); - } - else - { - CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); - } - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - return *this; - } - - - template - void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) - { - if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) - { - throw std::invalid_argument("All matrices must be on the same GPU"); - } - else - { - if (a.IsEmpty()) - throw std::logic_error("ElementWisePower: The input matrix a is empty."); - if (a.GetNumRows()!=c.GetNumRows() || a.GetNumCols()!=c.GetNumCols()) - throw std::logic_error("ElementWisePower: matrices must be of the same size"); - - cudaEvent_t done = nullptr; - a.PrepareDevice(); - if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - LONG64 N=(LONG64)a.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - _elementWisePowerOnCuda<<>>(alpha,a.m_pArray,c.m_pArray,N); - if (do_sync) CUDA_CALL(cudaEventRecord(done)); - if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); - if (do_sync) CUDA_CALL(cudaEventDestroy(done)); - } - } - - template - bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) - { - if (a.IsEmpty() || b.IsEmpty()) - throw std::logic_error("AreEqual: one of the input matrices is empty."); - - if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) - return false; - - a.PrepareDevice(); - long *res = new long[1]; - res[0]=1; - long *d_res = NULL; - CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(long)*1)); - CUDA_CALL(cudaMemcpy(d_res,res,sizeof(long)*1,cudaMemcpyHostToDevice)); - long N=(long)a.GetNumElements(); - int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); - _areEqual<<>>(a.m_pArray,b.m_pArray,N,threshold,d_res); - CUDA_CALL(cudaMemcpy(res,d_res,sizeof(long)*1,cudaMemcpyDeviceToHost)); - if (res[0]!=0) - return true; - else - return false; - } - - template - GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols) - { - GPUMatrix c(rows, cols); //will initialize to 0 - c.SetValue(1); - return c; - } - - template - GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols) - { - GPUMatrix c(rows, cols); //will initialize to 0 - //c.SetValue(0); - return c; - } - - template - GPUMatrix GPUMatrix::Eye(const size_t rows) - { - GPUMatrix c(rows, rows); //will initialize to 0 - c.SetDiagonalValue(1); - return c; - } - - template - GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) - { - GPUMatrix c(rows, cols); //will initialize to 0 - c.SetUniformRandomValue(low, high, seed); - return c; - } - - template - GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) - { - GPUMatrix c(rows, cols); //will initialize to 0 - c.SetGaussianRandomValue(mean, sigma, seed); - return c; - } - - template - ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix &Gradients, const GPUMatrix &SmoothedGradients) - { - Gradients.PrepareDevice(); - ElemType* d_res=NULL; - 
CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(ElemType))); //we allocate memory on the device - - //Compute inner product of matrices and keep it on device - const int m = (int)Gradients.GetNumRows(); - const int n = (int)Gradients.GetNumCols(); - const int k = (int)SmoothedGradients.GetNumRows(); - const int l = (int)SmoothedGradients.GetNumCols(); - assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow - assert (m==k && n==l); //converting from size_t to int may cause overflow - if (m!=k || n!=l) throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); - - if (sizeof(ElemType) == sizeof(double)) - { - cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - } - else - { - cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); - CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); - cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); - } - // d_res[0] should now contain inner product of matrices - // Compute squared Frobenius norms (squared sums of elements) - _lrHelper<<<1,512,0,t_stream>>>(Gradients.m_pArray,SmoothedGradients.m_pArray, (LONG64)Gradients.GetNumElements(), d_res); - ElemType res; - CUDA_CALL(cudaMemcpy(&res,d_res,sizeof(ElemType),cudaMemcpyDeviceToHost)); - CUDA_CALL(cudaFree(d_res)); - return res; - } - -#pragma endregion Static BLAS Functions - - - //#pragma region File << and >> operators - // template - // File& operator>>(File& stream, GPUMatrix &us) - // { - // //auto& us = *this; - // - // stream.GetMarker(fileMarkerBeginSection, std::string("BMAT")); - // size_t elsize; - // stream>>elsize; - // if (sizeof(ElemType)!=elsize) - // throw std::runtime_error("Template argument size doesn't match those in file"); - // std::wstring matrixName; - // size_t numRows, numCols; - // stream>>matrixName>>numRows>>numCols; - // ElemType* d_array = new ElemType[numRows*numCols]; - // for (long i=0;i>d_array[i]; - // stream.GetMarker(fileMarkerEndSection, std::string("EMAT")); - // us.SetValue(numRows,numCols,d_array, matrixFlagNormal); - // us.m_matrixName = matrixName; - // return stream; - // } - // - // template - // File& operator<<(File& stream, GPUMatrix &us) - // { - // //auto& us = *this; - // - // stream.PutMarker(fileMarkerBeginSection, std::string("BMAT")); - // stream<> operators - - template class GPUMatrix; - template class GPUMatrix; - template class DeviceBoundNumber; - template class DeviceBoundNumber; - - template - cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus]={0}; - - template - void* GPUMatrix::s_curandGenerator=NULL; -}}} - -// !!!!This is from helper_cuda.h which comes with CUDA samples!!!! 
-// Consider if it is beneficial to just include all of helper_cuda.h
-// TODO: This is duplicated in BestGpu.cpp
-// Beginning of GPU Architecture definitions
-int _ConvertSMVer2Cores(int major, int minor)
-{
-    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
-    typedef struct
-    {
-        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
-        int Cores;
-    } sSMtoCores;
-
-    sSMtoCores nGpuArchCoresPerSM[] =
-    {
-        { 0x10,  8 },  // Tesla Generation (SM 1.0) G80 class
-        { 0x11,  8 },  // Tesla Generation (SM 1.1) G8x class
-        { 0x12,  8 },  // Tesla Generation (SM 1.2) G9x class
-        { 0x13,  8 },  // Tesla Generation (SM 1.3) GT200 class
-        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
-        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
-        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
-        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
-        { -1, -1 }
-    };
-
-    int index = 0;
-
-    while (nGpuArchCoresPerSM[index].SM != -1)
-    {
-        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
-        {
-            return nGpuArchCoresPerSM[index].Cores;
-        }
-
-        index++;
-    }
-    return nGpuArchCoresPerSM[7].Cores;
-}
-// end of GPU Architecture definitions
-
-//inline long _GetFreeMemoryOnCUDADevice(int devId)
-//{
-//    CUdevice cudaDevice;
-//    CUresult result = cuDeviceGet(&cudaDevice, devId);
-//    if (result != CUDA_SUCCESS)
-//    {
-//        return 0;
-//    }
-//
-//    // create cuda context
-//    CUcontext cudaContext;
-//    result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice);
-//    if (result != CUDA_SUCCESS)
-//    {
-//        return 0;
-//    }
-//
-//    // get the amount of free memory on the graphics card
-//    size_t free;
-//    size_t total;
-//    result = cuMemGetInfo(&free, &total);
-//    if (result != CUDA_SUCCESS)
-//    {
-//        return 0;
-//    }
-//    else
-//        return (long)free;
-//}
-
-#endif // CPUONLY
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+
+#include "stdafx.h"
+#include "BestGpu.h"
+
+#ifndef CPUONLY
+
+#include "cublas_v2.h"
+#include <assert.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include "device_launch_parameters.h"
+#include "GPUMatrix.h"
+#include "GPUMatrixCUDAKernels.cu"
+#include "GPUSparseMatrix.h"
+#include <iostream> // for cout
+
+#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
+#pragma comment (lib, "cublas.lib")
+#pragma comment (lib, "cusparse.lib")
+#pragma comment (lib, "curand.lib")
+
+#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
+#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
+#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons
+
+#ifdef NO_SYNC
+bool do_sync = false;
+#else
+bool do_sync = true;
+#endif
+
+#ifdef _WIN32
+// thread local storage to access the current stream, initialize to default stream
+__declspec (thread)
+#endif
+cudaStream_t t_stream = cudaStreamDefault;
+
+extern int _ConvertSMVer2Cores(int major, int minor); // forward declaration
+
+// SetStream - set the stream that will be used by the GPU routines
+void MATH_API SetStream(cudaStream_t stream)
+{
+    t_stream = stream;
+}
+
+// GetStream - get the stream that will be used by the GPU routines
+cudaStream_t MATH_API GetStream()
+{
+    return t_stream;
+}
+
+
+void CURAND_CALL(curandStatus x)
+{
+    if (x != CURAND_STATUS_SUCCESS)
+    {
+        throw std::runtime_error("CURAND fail");
+    }
+}
+
+void CUBLAS_CALL(cublasStatus_t x)
+{
+    if (x != CUBLAS_STATUS_SUCCESS)
+    {
+        throw std::runtime_error("CUBLAS fail");
+    }
+}
+
+void CUDA_CALL(cudaError_t x)
+{
+    if (x != cudaSuccess)
+    {
+        const char* errmsg = cudaGetErrorString(x);
+        std::cerr << "!!!!!!!!CUDA EXCEPTION: " << errmsg << std::endl;
+        cudaDeviceSynchronize();
+        throw std::runtime_error(errmsg);
+    }
+}
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+    // PrepareDevice - Setup the correct cuda context for an operation
+    // deviceId - the device on which the operation will take place
+    void PrepareDevice(DEVICEID_TYPE deviceId)
+    {
+        static DEVICEID_TYPE currentDevice = AUTOPLACEMATRIX; // set to anything valid
+        // externally managed matrices are guaranteed to be on the right device
+        if (deviceId == MANAGEDEXTERN)
+            return;
+        // and if we last set the device to be this device we are good
+        if (deviceId == currentDevice)
+            return;
+        CUDA_CALL(cudaSetDevice(deviceId));
+        currentDevice = deviceId;
+    }
+
+#pragma region DeviceBoundNumber class
+
+    template<class ElemType>
+    DeviceBoundNumber<ElemType>::DeviceBoundNumber(const DeviceBoundNumber<ElemType>& /*deepCopy*/)
+    {
+        NOT_IMPLEMENTED;
+    }
+
+    template<class ElemType>
+    DeviceBoundNumber<ElemType>::DeviceBoundNumber(DeviceBoundNumber<ElemType>&& shallowCopy)
+    {
+        ShallowCopyFrom(shallowCopy.m_data, shallowCopy.m_computeDevice);
+        shallowCopy.m_data = NULL;
+    }
+
+    template<class ElemType>
+    void DeviceBoundNumber<ElemType>::ShallowCopyFrom(ElemType* newVal, int newValsDevceId)
+    {
+        m_computeDevice = newValsDevceId;
+        m_data = newVal;
+    }
+
+    template<class ElemType>
+    DeviceBoundNumber<ElemType>::~DeviceBoundNumber()
+    {
+        if (m_data != NULL)
+        {
+            if (m_computeDevice < 0)
+            {
+                delete m_data;
+                m_data = NULL;
+            }
+            else if (m_computeDevice != MANAGEDEXTERN)
+                CUDA_CALL(cudaFree(m_data));
+        }
+    }
+
+#pragma endregion DeviceBoundNumber class
+
+#pragma region Helper functions
+    template<class ElemType>
+    cublasHandle_t _initCUBLAS(int devId)
+    {
+        PrepareDevice((DEVICEID_TYPE)devId);
+        cublasHandle_t cuHandle;
+        CUBLAS_CALL(cublasCreate(&cuHandle));
+        return cuHandle;
+    }
+ + // GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information + // TODO: should be replaced by BestGpu class instead, it's much better + template + DEVICEID_TYPE GPUMatrix::GetBestGPUDeviceId() //returns -1 if no GPUs can be used + { + // currently there is little point in giving out different device IDs each time ask for a matrix, + // we really want them all on the same device eventually + static int chosenDeviceId = AUTOPLACEMATRIX; + if (chosenDeviceId != AUTOPLACEMATRIX) + return chosenDeviceId; + + __try + { + // stash previous device state + // if there was one on entry: + int nPrevDev = -1; + cudaError_t ePrevDev = cudaGetDevice(&nPrevDev); + + int deviceCount = -1; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + if (error_id != cudaSuccess || deviceCount==0) + { + return -1; + } + + int setDev = -1; + int curDev=0; + long curPower = 0; + for (DEVICEID_TYPE dev = 0; dev < deviceCount; ++dev) + { + CUDA_CALL(cudaSetDevice(dev)); + setDev = dev; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + long power = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; + //long power = _GetFreeMemoryOnCUDADevice(dev); + if (power>curPower) + { + curPower=power; + curDev = dev; + } + } + + if(nPrevDev >= 0 && ePrevDev == cudaSuccess && + setDev >= 0 && setDev != nPrevDev) { + // restore current context to the one we entered with + // if there was one the caller might want unchanged. + cudaSetDevice(nPrevDev); + } + chosenDeviceId = curDev; + return curDev; + } + __except (1) + { + return -1; // CPU + } + } + + // PrepareDevice - Setup the correct cuda context for an operation + // deviceId - the device on which the operation will take place + // defaults to -1, which means use matrices current device + template + DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const + { + // if default value use current compute device + DEVICEID_TYPE newId = deviceId >= 0 ? 
deviceId : m_computeDevice;
+
+        Microsoft::MSR::CNTK::PrepareDevice(newId);
+        return newId;
+    }
+
+    template<class ElemType>
+    ElemType* GPUMatrix<ElemType>::CopyToArray() const
+    {
+        size_t numElements = GetNumElements();
+        if (numElements != 0)
+        {
+            PrepareDevice();
+            ElemType* pArray = new ElemType[numElements];
+            CUDA_CALL(cudaMemcpy(pArray, m_pArray, sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyDeviceToHost));
+            return pArray;
+        }
+        else
+        {
+            return NULL;
+        }
+    }
+
+    // memory will be (re)allocated by this function if the buffer is too small; it must be deleted by the caller when done
+    // returns the number of elements copied
+    template<class ElemType>
+    size_t GPUMatrix<ElemType>::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const
+    {
+        size_t numElements = GetNumElements();
+
+        if (numElements > currentArraySize)
+        {
+            delete[] arrayCopyTo;
+            arrayCopyTo = new ElemType[numElements];
+            currentArraySize = numElements;
+        }
+
+        if (numElements != 0)
+        {
+            PrepareDevice();
+            CUDA_CALL(cudaMemcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements, cudaMemcpyDeviceToHost));
+        }
+
+        return numElements;
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::ChangeDeviceTo(DEVICEID_TYPE to_id)
+    {
+        if (!OwnBuffer())
+            throw std::logic_error("Cannot change device on Managed external matrix");
+        if (to_id == CPUDEVICE)
+            throw std::logic_error("to_id must be a valid GPU");
+        if (m_computeDevice == to_id)
+            return;
+
+        PrepareDevice((DEVICEID_TYPE)to_id);
+        ElemType* d_dst = NULL;
+        CUDA_CALL(cudaMalloc((void**)&d_dst, sizeof(ElemType)*m_numRows*m_numCols));
+
+        m_elemSizeAllocated = m_numRows*m_numCols;
+
+        // check to make sure we have something to copy (on init we often have zero sized allocations)
+        if (m_elemSizeAllocated > 0)
+        {
+            // first try peer access
+            int canAccessPeer = false;
+            CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice));
+            if (canAccessPeer)
+            {
+                CUDA_CALL(cudaDeviceEnablePeerAccess(m_computeDevice, 0));
+                CUDA_CALL(cudaMemcpyPeer(d_dst, to_id, m_pArray, m_computeDevice, sizeof(ElemType)*m_numRows*m_numCols));
+            }
+            else
+            {
+                // peer access didn't work, fall back to copying through the host
+                // TODO: make this more efficient by keeping some buffers available for each copy
+                ElemType* h_dst = NULL;
+                PrepareDevice();
+                CUDA_CALL(cudaMallocHost((void**)&h_dst, sizeof(ElemType)*m_numRows*m_numCols));
+                CUDA_CALL(cudaMemcpy(h_dst, m_pArray, sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyDeviceToHost));
+                PrepareDevice((DEVICEID_TYPE)to_id);
+                CUDA_CALL(cudaMemcpy(d_dst, h_dst, sizeof(ElemType)*m_numRows*m_numCols, cudaMemcpyHostToDevice));
+                CUDA_CALL(cudaFreeHost(h_dst));
+            }
+        }
+        PrepareDevice();
+        CUDA_CALL(cudaFree(m_pArray));
+        m_pArray = d_dst;
+
+        PrepareDevice((DEVICEID_TYPE)to_id);
+        m_computeDevice = to_id;
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::performInplaceFunction(int kind)
+    {
+        PrepareDevice();
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        switch (kind)
+        {
+        case 0:
+            _inplaceSigmoidOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 1:
+            _inplaceTanhOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 2:
+            _inplaceSqrtOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 3:
+            _inplaceExpOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 4:
+            _inplaceLogOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 5:
+            _inplaceAbsOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 6:
+            _inplaceLinRectDerivative<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 7:
+            _inplaceCosineOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        case 8:
+            _inplaceNegativeSineOnCuda<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, N);
+            break;
+        }
+        if (do_sync)
CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + +#pragma endregion Helper functions + +#pragma region Constructors and Destructor + + //should only be used by constructors. + template + void GPUMatrix::ZeroInit(int deviceId) + { + m_computeDevice = deviceId; + m_pArray = nullptr; + m_numRows = 0; + m_numCols = 0; + m_elemSizeAllocated = 0; + m_matrixName=NULL; + m_format = matrixFormatDense; + m_externalBuffer = false; + } + + template + GPUMatrix::GPUMatrix(int deviceId) + { + if (deviceId == MANAGEDEXTERN) + throw std::logic_error("Basic constructor cannot be used with Managed Extern types"); + + ZeroInit(deviceId); + }; + + //matrixName is used to verify that correct matrix is read. + template + GPUMatrix::GPUMatrix(FILE* f, const char * matrixName, int deviceId) + { + if (deviceId == MANAGEDEXTERN) + throw std::logic_error("File constructor cannot be used with Managed Extern types"); + + ReadFromFile(f, matrixName); + } + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols,int deviceId) + { + if (deviceId == MANAGEDEXTERN) + throw std::logic_error("constructor cannot be used with Managed Extern types"); + ZeroInit(deviceId); + m_numRows = numRows; + m_numCols = numCols; + m_elemSizeAllocated = GetNumElements(); + + if (m_elemSizeAllocated != 0) + { + PrepareDevice(); + CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); + CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); + } + }; + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags, int deviceId) + { + ZeroInit(deviceId); + SetValue(numRows, numCols, pArray, matrixFlags, deviceId); + }; + + template + GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) + { + ZeroInit(deepCopyFrom.m_computeDevice); + SetValue(deepCopyFrom); + SetMatrixName(deepCopyFrom.m_matrixName); + } + + template + GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) + { + m_numRows = moveFrom.m_numRows; + m_numCols = moveFrom.m_numCols; + m_computeDevice = moveFrom.m_computeDevice; + m_pArray = moveFrom.m_pArray; //shallow copy the pointer + m_matrixName=moveFrom.m_matrixName; + m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + m_format = moveFrom.m_format; + m_externalBuffer = moveFrom.m_externalBuffer; + + //release the pointer from the source object so that the destructor won't release it twice + moveFrom.ZeroInit(0); + } + + //assignment operator, deep copy + template + GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) + { + if (this != &deepCopyFrom) + { + SetValue(deepCopyFrom); + SetMatrixName(deepCopyFrom.m_matrixName); + } + return *this; + } + + //move assignment operator, shallow copy + template + GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) + { + if (this != &moveFrom) + { + if (OwnBuffer() && m_pArray!=NULL) + { + CUDA_CALL(cudaFree(m_pArray)); + } + + m_numRows = moveFrom.m_numRows; + m_numCols = moveFrom.m_numCols; + m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + m_pArray = moveFrom.m_pArray; + m_computeDevice = moveFrom.m_computeDevice; + m_format = moveFrom.m_format; + m_externalBuffer = moveFrom.m_externalBuffer; + + //release the pointer from the source object so that the destructor won't release it twice + moveFrom.ZeroInit(0); + } + return *this; + } + + template + GPUMatrix::~GPUMatrix(void) + { + Clear(); + } + + template + void GPUMatrix::Clear() + { + if (OwnBuffer() && 
m_pArray!=NULL) + { + if (m_computeDevice>=0) + { + PrepareDevice(); + cudaFree(m_pArray); + m_pArray = NULL; + m_elemSizeAllocated = 0; + } + } + BaseMatrix::Clear(); + + ZeroInit(m_computeDevice); + } +#pragma endregion Constructors and Destructor + + template + int GPUMatrix::GetComputeDeviceId() const + { + // for externally managed memory the CUDA context will have the current device + if (m_computeDevice == MANAGEDEXTERN) + { + int devId; + assert(m_externalBuffer); + CUDA_CALL(cudaGetDevice(&devId)); + return devId; + } + return m_computeDevice; + } + +#pragma region Basic Operators + template + GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const + { + if (numCols == 0) + throw std::logic_error("The slice cannot have 0 columns."); + + if (startColumn + numCols > m_numCols) + throw std::logic_error("The slice is out of range of the source matrix."); + + GPUMatrix slice(m_numRows, numCols, m_pArray + startColumn * m_numRows, matrixFlagDontOwnBuffer, m_computeDevice); + + return slice; + } + + template + GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) + { + if (numCols == 0) + throw std::logic_error("The slice cannot have 0 columns."); + + if (startColumn + numCols > m_numCols) + throw std::logic_error("The slice is out of range of the source matrix."); + + Clear(); + + m_computeDevice=fromMatrix.m_computeDevice; + m_externalBuffer=true; + m_numRows = fromMatrix.m_numRows; + m_pArray=fromMatrix.m_pArray + startColumn * m_numRows; + + m_elemSizeAllocated = GetNumElements(); + m_matrixName=NULL; + m_format = fromMatrix.m_format; + + return *this; + } + + + //for each column of a, we assign numRows starting from startIndex to this + template + GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + throw std::logic_error("AssignRowSliceValuesOf: input matrix a is empty."); + + if (startIndex + numRows > a.GetNumRows()) + throw std::logic_error("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); + + Resize(numRows, a.GetNumCols()); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)numRows, (long)a.GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + //for the row slice of this starting from startIndex we add a to it. 
+ template + GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + throw std::logic_error("AddToRowSliceValuesOf: input matrix a is empty."); + + if (a.GetNumRows() != numRows) + throw std::logic_error("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); + + if (startIndex + numRows > GetNumRows()) + throw std::logic_error("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + throw std::logic_error("AddToRowSliceValuesOf: columns does not match."); + + LONG64 N=(LONG64)a.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addToRowSliceValuesOf<<>>(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + //for each column of this, we add row slice of a starting from startIndex + template + GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + throw std::logic_error("AddWithRowSliceValuesOf: input matrix a is empty."); + + if (GetNumRows() != numRows) + throw std::logic_error("AddWithRowSliceValuesOf: GetNumRows() != numRows."); + + if (startIndex + numRows > a.GetNumRows()) + throw std::logic_error("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + throw std::logic_error("AddWithRowSliceValuesOf: columns does not match."); + + LONG64 N = (LONG64)GetNumElements(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addWithRowSliceValuesOf << > >(m_pArray, a.m_pArray, N, (long)startIndex, (long)GetNumRows(), (long)a.GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) + { + if (this == &a) + throw std::logic_error("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); + + if (a.IsEmpty()) + throw std::logic_error("AssignRepeatOf: Matrix a is empty."); + + Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); + + LONG64 N = (LONG64)GetNumElements(); + long n = (long)a.GetNumCols(), m = (long)a.GetNumRows(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignRepeatOf << > >(m_pArray, a.m_pArray, N, m, n, (long)GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix GPUMatrix::Transpose() const + { + if (IsEmpty()) + throw std::logic_error("Transpose: Matrix is empty."); + + GPUMatrix c(GetComputeDeviceId()); + c.AssignTransposeOf(*this); + return c; + } + + // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU + // computeDevice - The compute device for which the cublas handle is desired + // returns: cublas handle + // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends + template + cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice/*=-1*/) + { + // if the compute device is not passed, get the current device from CUDA + if (computeDevice < 0) + cudaGetDevice(&computeDevice); + + if (computeDevice < 0 || computeDevice >= MaxGpus) + throw std::logic_error("GetCublasHandle: Maximum GPU exceeded"); + cublasHandle_t cuHandle = s_cuHandle[computeDevice]; + if (cuHandle == NULL) + { + s_cuHandle[computeDevice] = cuHandle = _initCUBLAS(computeDevice); + } + CUBLAS_CALL(cublasSetStream(cuHandle, t_stream)); + + return cuHandle; + } + + template + GPUMatrix& GPUMatrix::AssignTransposeOf (const GPUMatrix& a) + { + if (this == &a) + throw std::logic_error("AssignTransposeOf: a is the same as [this]. 
Does not support inplace transpose.");
+
+        if (a.IsEmpty())
+            throw std::logic_error("AssignTransposeOf: Matrix a is empty.");
+
+        if (GetNumRows() != a.GetNumCols() || GetNumCols() != a.GetNumRows())
+            Resize(a.GetNumCols(), a.GetNumRows());
+
+        cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
+        cublasOperation_t transA = CUBLAS_OP_T;
+        cublasOperation_t transB = CUBLAS_OP_T;
+        int m = (int)a.m_numCols;
+        int n = (int)a.m_numRows;
+        ElemType alpha = 1;
+        ElemType beta = 0;
+        cublasStatus_t st;
+        if (sizeof(ElemType) == sizeof(float))
+        {
+            st = cublasSgeam(cuHandle, transA, transB, m, n, reinterpret_cast<float*>(&alpha), reinterpret_cast<float*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<float*>(&beta), reinterpret_cast<float*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<float*>(m_pArray), (int)m_numRows);
+        }
+        else if (sizeof(ElemType) == sizeof(double))
+        {
+            st = cublasDgeam(cuHandle, transA, transB, m, n, reinterpret_cast<double*>(&alpha), reinterpret_cast<double*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<double*>(&beta), reinterpret_cast<double*>(a.m_pArray), (int)a.m_numRows, reinterpret_cast<double*>(m_pArray), (int)m_numRows);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported template argument in GPUMatrix");
+        }
+        if (st != CUBLAS_STATUS_SUCCESS)
+        {
+            throw std::runtime_error("AssignTransposeOf failed");
+        }
+        m_numRows = a.m_numCols;
+        m_numCols = a.m_numRows;
+        SetMatrixName(a.GetMatrixName());
+        return *this;
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const ElemType v)
+    {
+        if (IsEmpty())
+            throw std::logic_error("SetValue: Matrix is empty.");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _setValue<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, v, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const ElemType* d_v) //d_v is a pointer to the value in GPU memory
+    {
+        if (IsEmpty())
+            throw std::logic_error("SetValue: Matrix is empty.");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _setValue<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, d_v, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetColumn(const ElemType* colPointer, size_t colInd)
+    {
+        if (IsEmpty())
+            throw std::logic_error("SetColumn: Matrix is empty.");
+        if (colPointer == NULL)
+            return;
+        CUDA_CALL(cudaMemcpy(m_pArray + LocateColumn(colInd), colPointer, sizeof(ElemType)*m_numRows, cudaMemcpyHostToDevice));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& deepCopyFrom)
+    {
+        if (this == &deepCopyFrom)
+            return;
+
+        Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols());
+        m_format = deepCopyFrom.m_format; // copy the format over just to be sure
+        size_t cpSize = deepCopyFrom.GetNumRows() * deepCopyFrom.GetNumCols();
+        if (cpSize != 0)
+            CUDA_CALL(cudaMemcpy(m_pArray, deepCopyFrom.m_pArray, cpSize*sizeof(ElemType), cudaMemcpyDeviceToDevice));
+    }
+
+    template<class ElemType>
+    void GPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, size_t matrixFlags, int deviceId)
+    {
+        // handle externally managed case
+        if (matrixFlags & matrixFlagDontOwnBuffer)
+        {
+            // free the existing array if it used to be an owned array
+            if (OwnBuffer() && m_pArray != NULL)
+            {
PrepareDevice(); + CUDA_CALL(cudaFree(m_pArray)); + } + m_numRows = numRows; + m_numCols = numCols; + m_pArray = pArray; + m_elemSizeAllocated = GetNumElements(); + m_matrixName = NULL; + m_format = matrixFormatDense; + m_externalBuffer = true; + m_computeDevice = deviceId; + } + else + { + // if didn't previously own the buffer, wipe it clean + if (!OwnBuffer()) + { + ZeroInit(deviceId); + } + + // if the devices are different move it now + if (m_computeDevice != deviceId && deviceId >= 0) + { + Clear(); + ZeroInit(deviceId); + } + + // now resize/allocate as necessary + Resize(numRows, numCols); + m_externalBuffer = false; + + // copy over the content to the buffer + PrepareDevice(); + if (pArray!=NULL) + { + if (!(matrixFlags&matrixFormatRowMajor)) + { + CUDA_CALL(cudaMemcpy(m_pArray, pArray, sizeof(ElemType)*GetNumElements(), + (matrixFlags&matrixFlagSetValueOnDevice)?cudaMemcpyDeviceToDevice:cudaMemcpyHostToDevice)); + } + else + { + throw std::runtime_error("Row major isn't implemented"); + } + } + } + m_format = matrixFormatDense; + } + + + template + void GPUMatrix::SetDiagonalValue(const ElemType v) + { + unsigned long N=(unsigned long)GetNumRows(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setDiagonalValue<<>>(m_pArray,v,N,(unsigned long)GetNumRows()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::SetDiagonalValue(GPUMatrix& vector) + { + if (IsEmpty() || vector.IsEmpty()) + throw std::logic_error("SetDiagonalValue: Matrix is empty."); + + if (GetNumRows() != GetNumCols()) + throw std::logic_error("SetDiagonalValue: NumRows and NumCols do not agree."); + + if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) + throw std::logic_error("SetDiagonalValue: input vector must be a vector."); + + if (vector.GetNumElements() == 1) //reduce to simple form + SetDiagonalValue(vector.m_pArray[0]); + + else if (vector.GetNumRows() != GetNumRows()) + throw std::logic_error("SetDiagonalValue: input vector's dimension does not agree with [this]."); + else + { + long N=(long)GetNumRows(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setDiagonalValueFromVector<<>>(m_pArray,vector.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + template + void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) + { + PrepareDevice(); + if (s_curandGenerator==NULL) + { + s_curandGenerator = new curandGenerator_t; + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); + CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); + } + + cudaEvent_t done = nullptr; + CUDA_CALL(cudaEventCreate(&done)); + if (sizeof(ElemType)==sizeof(float)) + { + CURAND_CALL(curandGenerateUniform(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); + } + else + { + CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements())); + } + CUDA_CALL(cudaEventRecord(done)); + CUDA_CALL(cudaEventSynchronize(done)); + //CURAND_CALL(curandDestroyGenerator(gen)); + CUDA_CALL(cudaEventDestroy(done)); + + size_t N=GetNumElements(); + size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _rescaleToRange<<>>(m_pArray,N,low,high); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) + { + PrepareDevice(); + if (s_curandGenerator==NULL) + { + s_curandGenerator = new curandGenerator_t; + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? time(NULL) : seed)); + CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); + } + + if (sizeof(ElemType)==sizeof(float)) + { + CURAND_CALL(curandGenerateNormal(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (float)mean, (float)sigma)); + } + else + { + CURAND_CALL(curandGenerateNormalDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(m_pArray), GetNumElements(), (double)mean, (double)sigma)); + } + //CURAND_CALL(curandDestroyGenerator(gen)); + } + + //maskRate: percentage of values masked out (similar to dropout rate) + //scaleValue: which scale value to set to the left ones (unmasked items). + template + void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) + { + PrepareDevice(); + if (s_curandGenerator==NULL) + { + s_curandGenerator = new curandGenerator_t; + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*)s_curandGenerator)[0]),CURAND_RNG_PSEUDO_XORWOW)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*)s_curandGenerator)[0], seed==USE_TIME_BASED_SEED ? 
time(NULL) : seed)); + CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*)s_curandGenerator)[0],CURAND_ORDERING_PSEUDO_SEEDED)); + } + + cudaEvent_t done = nullptr; + CUDA_CALL(cudaEventCreate(&done)); + if (sizeof(ElemType)==sizeof(float)) + { + CURAND_CALL(curandGenerateUniform((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); + } + else + { + CURAND_CALL(curandGenerateUniformDouble((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(m_pArray), GetNumElements())); + } + CUDA_CALL(cudaEventRecord(done)); + CUDA_CALL(cudaEventSynchronize(done)); + CUDA_CALL(cudaEventDestroy(done)); + //CURAND_CALL(curandDestroyGenerator(gen)); + + size_t N=GetNumElements(); + size_t blocksPerGrid = (size_t)ceil(N/(double)threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setMaskAndScale<<>>(m_pArray,N,maskRate,scaleValue); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::Adagrad(GPUMatrix& gradients) + { + if (IsEmpty()) + { + Resize(gradients.GetNumRows(), gradients.GetNumCols()); + SetValue(0.0); + } + + assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols()); + + int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; + _adagrad<<>>(m_pArray, gradients.m_pArray, GetNumElements()); + } + + template + void GPUMatrix::RmsProp(GPUMatrix& gradients, + ElemType RMS_GAMMA, + ElemType RMS_WGT_INC, + ElemType RMS_WGT_MAX, + ElemType RMS_WGT_DEC, + ElemType RMS_WGT_MIN + ) + { + const ElemType floor = 1e-6f; + static ElemType *upd_gpu = (ElemType*)0; + + size_t n = gradients.GetNumElements(); + int blocksPerGrid = (GetNumElements() + threadsPerBlock -1 )/threadsPerBlock; + + if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3) + { + Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3); + SetValue(0.0); + + ElemType *avars=m_pArray; // accumulated variances for RMS scaling + ElemType *signs=m_pArray+n; // sign of previous gradient + ElemType *steps=m_pArray+2*n; // current step size + + _rmsprop_init<<>>(avars,signs,steps,gradients.m_pArray,n); + + } + + ElemType *avars=m_pArray; // accumulated variances for RMS scaling + ElemType *signs=m_pArray+n; // sign of previous gradient + ElemType *steps=m_pArray+2*n; // current step size + + assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3); + + if( !upd_gpu ) + { + ElemType upd[] = { + 2,2,0, + 2,2,0, + 1,1,1, + 2,2,0, + 1,2,1, + 0,2,2, + 1,1,1, + 0,2,2, + 0,2,2, + }; + + CUDA_CALL(cudaMalloc((void**)&upd_gpu,sizeof(ElemType)*27)); + CUDA_CALL(cudaMemcpy(upd_gpu,upd,sizeof(ElemType)*27,cudaMemcpyHostToDevice)); + } + + _rmsprop<<>>(avars,signs,steps,gradients.m_pArray,n, + RMS_GAMMA,RMS_WGT_INC,RMS_WGT_MAX,RMS_WGT_DEC,RMS_WGT_MIN, + floor,upd_gpu); + } + + template + void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) + { + assert (numRows*numCols == GetNumElements()); + if (numRows*numCols != GetNumElements()) + throw std::invalid_argument("Reshape: total number of elements does not match."); + + m_numRows = numRows; + m_numCols = numCols; + } + + template + void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) + { + if (m_numRows==numRows && m_numCols==numCols) + return; + + m_numRows = numRows; + m_numCols = numCols; + + size_t numElements = GetNumElements(); + if (numElements > 
m_elemSizeAllocated || (!growOnly && numElements != m_elemSizeAllocated)) + { + if (IsEmpty()) + { + m_elemSizeAllocated = 0; + m_pArray = NULL; + } + else + { + if (!OwnBuffer()) + throw std::invalid_argument("Can't resize a externally managed matrix"); + PrepareDevice(); + if (m_pArray!=NULL) + CUDA_CALL(cudaFree(m_pArray)); //delete and reallocate + m_elemSizeAllocated = numElements; + CUDA_CALL(cudaMalloc((void**)&m_pArray,sizeof(ElemType)*m_elemSizeAllocated)); + CUDA_CALL(cudaMemset(m_pArray,0,sizeof(ElemType)*m_elemSizeAllocated)); + } + } + } + + template + size_t GPUMatrix::LocateElement (const size_t row, const size_t col) const + { + assert (row < m_numRows && col < m_numCols); + return col * m_numRows + row; // matrix in column-wise storage + } + + template + size_t GPUMatrix::LocateColumn (const size_t col) const + { + assert (col < m_numCols); + return col * m_numRows; // matrix in column-wise storage + } + + template + ElemType GPUMatrix::Get00Element() const + { + ElemType res=0; + CUDA_CALL(cudaMemcpy(&res,m_pArray,sizeof(ElemType),cudaMemcpyDeviceToHost)); + return res; + } +#pragma endregion Basic Operators + +#pragma region Member BLAS Functions + template + GPUMatrix& GPUMatrix::operator+= (ElemType alpha) + { + if (IsEmpty()) + throw std::logic_error("operator+=: Matrix is empty."); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addValue<<>>(m_pArray,alpha,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (ElemType alpha) const + { + if (IsEmpty()) + throw std::logic_error("operator+: Matrix is empty."); + + const GPUMatrix& us=*this; + GPUMatrix c(us); + c+=alpha; + return c; + } + + template + GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) + { + SetValue(a); + (*this)+=alpha; + return (*this); + } + + + template + GPUMatrix& GPUMatrix::operator+= (const GPUMatrix& a) + { + //if (a.GetNumElements()==1) + //{ + // //*this += a.Get00Element(); + // LONG64 N=(LONG64)GetNumElements(); + // int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + // cudaEvent_t done = nullptr; + // if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + // _addValue<<>>(m_pArray,a.m_pArray,N); + // if (do_sync) CUDA_CALL(cudaEventRecord(done)); + // if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + // if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + //} + //else + //{ + ScaleAndAdd(1, a, *this); + //} + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (const GPUMatrix& a) const + { + if (GetNumElements()==1) + { + GPUMatrix c(a); + c+=Get00Element(); + return c; + } + else if (a.GetNumElements()==1) + { + GPUMatrix c(*this); + c+=a.Get00Element(); + return c; + } + else + { + GPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but reuses the existing code
+        c += a;
+        return c;
+        }
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSumOf(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
+    {
+        SetValue(a);
+        (*this) += b;
+        return (*this);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator-= (ElemType alpha)
+    {
+        if (IsEmpty())
+            throw std::logic_error("operator-=: Matrix is empty.");
+        return operator+=(-1*alpha);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator- (ElemType alpha) const
+    {
+        if (IsEmpty())
+            throw std::logic_error("operator-: Matrix is empty.");
+        return operator+(-1*alpha);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignDifferenceOf(const ElemType alpha, const GPUMatrix<ElemType>& a)
+    {
+        Resize(a.m_numRows, a.m_numCols);
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignDifferenceOf1<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, alpha, a.m_pArray, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+        /*Resize(a.m_numRows,a.m_numCols);
+        SetValue(alpha);
+        (*this)-=a;
+        return *this;*/
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignDifferenceOf(const GPUMatrix<ElemType>& a, const ElemType alpha)
+    {
+        Resize(a.m_numRows, a.m_numCols);
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignDifferenceOf2<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, alpha, a.m_pArray, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+        /*SetValue(a);
+        (*this)-=alpha;
+        return *this;*/
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator-= (const GPUMatrix<ElemType>& a)
+    {
+        //if (a.GetNumElements() == 1)
+        //    AssignDifferenceOf(*this, a.Get00Element());
+        //else if (GetNumElements() == 1)
+        //    AssignDifferenceOf(Get00Element(), a);
+        //else
+        ScaleAndAdd(-1, a, *this);
+
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator- (const GPUMatrix<ElemType>& a) const
+    {
+        GPUMatrix<ElemType> c(*this); //this implementation will introduce a copy overhead.
but reuses the existing code
+        c -= a;
+        return c;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignDifferenceOf(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
+    {
+        if (this != &a)
+        {
+            Resize(a.GetNumRows(), a.GetNumCols());
+            SetValue(a);
+        }
+        (*this) -= b;
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator*= (ElemType alpha)
+    {
+        Scale(alpha, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator* (ElemType alpha) const
+    {
+        GPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+        Scale(alpha, *this, c);
+        return c;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignProductOf(const ElemType alpha, const GPUMatrix<ElemType>& a)
+    {
+        Scale(alpha, a, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignProductOf (const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB)
+    {
+        if (a.GetNumElements() == 1)
+        {
+            if (transposeB)
+                AssignTransposeOf(b);
+            (*this) *= a.Get00Element();
+        }
+        else if (b.GetNumElements() == 1)
+        {
+            if (transposeA)
+                AssignTransposeOf(a);
+            (*this) *= b.Get00Element();
+        }
+        else
+            Multiply(a, transposeA, b, transposeB, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator* (const GPUMatrix<ElemType>& a) const
+    {
+        const GPUMatrix<ElemType>& us = *this;
+        if (GetNumElements() == 1)
+        {
+            GPUMatrix<ElemType> c(GetComputeDeviceId());
+            c.AssignProductOf(Get00Element(), a);
+            return c;
+        }
+        else if (a.GetNumElements() == 1)
+        {
+            GPUMatrix<ElemType> c(GetComputeDeviceId());
+            c.AssignProductOf(a.Get00Element(), us);
+            return c;
+        }
+        else
+        {
+            GPUMatrix<ElemType> c(GetNumRows(), a.GetNumCols(), GetComputeDeviceId());
+            Multiply(*this, a, c);
+            return c;
+        }
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator/= (ElemType alpha)
+    {
+        (*this) *= 1/alpha;
+        return (*this);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator/ (ElemType alpha) const
+    {
+        return ((*this) * (1/alpha));
+    }
+
+    //element-wise power
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::operator^= (ElemType alpha)
+    {
+        GPUMatrix<ElemType>& us = *this;
+        ElementWisePower(alpha, us, us);
+        return us;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::operator^ (ElemType alpha) const
+    {
+        GPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+        ElementWisePower(alpha, *this, c);
+        return c;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignElementPowerOf(const GPUMatrix<ElemType>& a, const ElemType power)
+    {
+        ElementWisePower(power, a, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddElementProductOf (const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            throw std::logic_error("AddElementProductOf: Matrix is empty.");
+
+        assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
+        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
+            throw std::invalid_argument("The input matrix dimensions do not match.");
+
+        if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols()))
+            throw std::invalid_argument("The input matrix dimensions do not match [this].");
+
+        LONG64 N = (LONG64)GetNumElements();
+        int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _addElementProductOf<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(m_pArray, a.m_pArray, b.m_pArray, N);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::ColumnElementMultiplyWith(const GPUMatrix<ElemType>& a)
+    {
+        if (a.IsEmpty() || IsEmpty())
+            throw std::logic_error("ColumnElementMultiplyWith:
Matrix is empty."); + + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) + throw std::invalid_argument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); + + long N=(long)a.GetNumRows(); + long M=(long)GetNumCols(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _columnElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + throw std::logic_error("RowElementMultiplyWith: Matrix is empty."); + + if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) + throw std::invalid_argument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); + + long N = (long)GetNumRows(); + long M = (long)a.GetNumCols(); + int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _rowElementMultiplyWith<<>>(m_pArray,a.m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::RowElementDivideBy(const GPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + throw std::logic_error("RowElementDivideBy: Matrix is empty."); + + if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) + throw std::invalid_argument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); + + long N = (long)GetNumRows(); + long M = (long)a.GetNumCols(); + int blocksPerGrid = (int)ceil(1.0*M / threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _rowElementDivideBy << > >(m_pArray, a.m_pArray, N, M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::ColumnElementDivideBy(const GPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + throw std::logic_error("ColumnElementDivideBy: Matrix is empty."); + + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) + throw std::invalid_argument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows."); + + long N = (long)a.GetNumRows(); + long M = (long)GetNumCols(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _ColumnElementDivideBy<<>>(m_pArray,a.m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::ElementInverse () + { + if (IsEmpty()) + throw std::logic_error("ElementInverse: Matrix is empty."); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _elemInverse<<>>(m_pArray,N); + if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementInverseOf (const GPUMatrix& a) + { + SetValue(a); + return ElementInverse(); + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoid() + { + performInplaceFunction(0); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidOf (const GPUMatrix& a) + { + Resize(a.GetNumRows(),a.GetNumCols()); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignSigmoidOf<<>>(a.m_pArray,m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + /*SetValue(a); + InplaceSigmoid();*/ + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoidDerivative() + { + AssignSigmoidDerivativeOf(*this); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidDerivativeOf (const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AssignSigmoidDerivativeOf: Matrix a is empty."); + + //auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + PrepareDevice(); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + + _assignSigmoidDerivative<<>>(a.m_pArray, m_pArray, N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + + template + GPUMatrix& GPUMatrix::InplaceTanh() + { + performInplaceFunction(1); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTanhOf (const GPUMatrix& a) + { + SetValue(a); + InplaceTanh(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLogSoftmax (const bool isColWise) + { + if (IsEmpty()) + throw std::logic_error("InplaceLogSoftmax: Matrix is empty."); + + PrepareDevice(); + if (isColWise) + { + long N=(long)GetNumCols(); //one kernel per column + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _logSoftMaxColWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else + { + long N=(long)GetNumRows(); //one kernel per column + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _logSoftMaxRowWise<<>>(m_pArray,(long)m_numCols,(long)m_numRows); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLogSoftmaxOf (const GPUMatrix& a, const bool isColWise) + { + Resize(a.GetNumRows(),a.GetNumCols()); + if (isColWise) + { + PrepareDevice(); + long N = (long)GetNumCols(); + long M = (long)GetNumRows(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignColumnwiseLogSoftmaxOf<<>>(a.m_pArray,m_pArray,N,M); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) 
CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else + { + NOT_IMPLEMENTED; + } + + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSqrt() + { + performInplaceFunction(2); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSqrtOf (const GPUMatrix& a) + { + SetValue(a); + InplaceSqrt(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceExp() + { + performInplaceFunction(3); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignExpOf (const GPUMatrix& a) + { + SetValue(a); + InplaceExp(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLog() + { + performInplaceFunction(4); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLogOf (const GPUMatrix& a) + { + SetValue(a); + InplaceLog(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceAbs() + { + performInplaceFunction(5); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAbsOf (const GPUMatrix& a) + { + SetValue(a); + InplaceAbs(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLinearRectifierDerivative() + { + performInplaceFunction(6); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLinearRectifierDerivativeOf (const GPUMatrix& a) + { + SetValue(a); + InplaceLinearRectifierDerivative(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceCosine() + { + performInplaceFunction(7); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignCosineOf (const GPUMatrix& a) + { + SetValue(a); + InplaceCosine(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceNegativeSine() + { + performInplaceFunction(8); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignNegativeSineOf (const GPUMatrix& a) + { + SetValue(a); + InplaceNegativeSine(); + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateBottom (const ElemType threshold) + { + if (IsEmpty()) + throw std::logic_error("InplaceTruncateBottom: Matrix is empty."); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _inplaceTruncateBottom<<>>(m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateBottomOf (const GPUMatrix& a, const ElemType threshold) + { + if (a.IsEmpty()) + throw std::logic_error("AssignTruncateBottomOf: Matrix a is empty."); + + if (this!=&a) + { + Resize(a.GetNumRows(), a.GetNumCols()); + } + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignTruncateBottom<<>>(m_pArray,a.m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateTop (const ElemType threshold) + { + if (IsEmpty()) + throw std::logic_error("InplaceTruncateTop: Matrix is empty."); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _inplaceTruncateTop<<>>(m_pArray,threshold,N); 
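+        // Editorial note: every kernel launch in this file follows the same
+        // do_sync pattern seen here - create a cudaEvent_t before the launch,
+        // record it on the stream immediately after, block the host on it,
+        // then destroy it. Sketched with a placeholder kernel name:
+        //
+        //     if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        //     _someKernel<<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(/*...*/);
+        //     if (do_sync) CUDA_CALL(cudaEventRecord(done));      // mark completion point
+        //     if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); // host waits for the kernel
+        //     if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+        //
+        // When NO_SYNC is defined, do_sync is false and launches stay fully
+        // asynchronous on t_stream.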
+ if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateTopOf (const GPUMatrix& a, const ElemType threshold) + { + if (a.IsEmpty()) + throw std::logic_error("AssignTruncateTopOf: Matrix a is empty."); + + if (this!=&a) + { + Resize(a.GetNumRows(), a.GetNumCols()); + } + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignTruncateTop<<>>(m_pArray,a.m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + template + GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + if (IsEmpty()) + throw std::logic_error("SetToZeroIfAbsLessThan: Matrix is empty."); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock); + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _setToZeroIfAbsLessThan<<>>(m_pArray,threshold,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + ElemType GPUMatrix::SumOfAbsElements() const + { + if (IsEmpty()) + throw std::logic_error("SumOfAbsElements: Matrix is empty"); + + cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); + if (sizeof(ElemType)==sizeof(float)) + { + float res=0; + cublasSasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); + return res; + } + else + { + double res=0; + cublasDasum(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&res); + return ElemType(res); + } + } + + template + ElemType GPUMatrix::SumOfElements() const + { + if (IsEmpty()) + throw std::logic_error("SumOfElements: Matrix is empty"); + + PrepareDevice(); + ElemType* d_sum = NULL; + ElemType h_sum; + CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! + _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements()); + CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_sum)); + return h_sum; + } + + + template + GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AssignSumOfElements: Matrix a is empty"); + + Resize(1,1); + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + //WARNING: THIS kernel is not the most efficient way! + _reductionSumAndAssign<<<1,1024>>>(m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return (*this); + } + + template + DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const + { + if (IsEmpty()) + throw std::logic_error("Matrix is empty"); + PrepareDevice(); + ElemType* d_sum = NULL; + CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! 
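+        // Editorial note: the warning above refers to the launch shape - a
+        // single block of 1024 threads strides over all N elements, so only
+        // one multiprocessor does the work. A multi-block tree reduction
+        // (grid-wide partial sums followed by a final single-block pass)
+        // would be faster; the one-block form is kept for simplicity.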
+ _reductionSum<<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements()); + DeviceBoundNumber result; + result.ShallowCopyFrom(d_sum,GetComputeDeviceId()); + return result; + } + + template + ElemType GPUMatrix::Max() const + { + cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); + ElemType res; + if (sizeof(ElemType)==sizeof(float)) + { + int resInd=0; + cublasIsamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&resInd); + resInd--; + CUDA_CALL(cudaMemcpy(reinterpret_cast(&res),reinterpret_cast(m_pArray+resInd),sizeof(float),cudaMemcpyDeviceToHost)); + return res; + } + else + { + int resInd=0; + cublasIdamax(cuHandle,(LONG64)GetNumElements(),reinterpret_cast(m_pArray),1,&resInd); + resInd--; + CUDA_CALL(cudaMemcpy(reinterpret_cast(&res),m_pArray+resInd,sizeof(float),cudaMemcpyDeviceToHost)); + return res; + } + } + + + template + GPUMatrix& GPUMatrix::ElementMultiplyWith (const GPUMatrix& a) + { + if (IsEmpty() || a.IsEmpty()) + throw std::logic_error("ElementMultiplyWith: Matrix is empty."); + + GPUMatrix& us=*this; + assert (us.GetNumRows() == a.GetNumRows() && us.GetNumCols() == a.GetNumCols()); + if (us.GetNumRows() != a.GetNumRows() || us.GetNumCols() != a.GetNumCols()) + throw std::invalid_argument("The matrix dimensions do not match."); + + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _elemMul<<>>(m_pArray,a.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementProductOf (const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AssignElementProductOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + throw std::invalid_argument("The input matrix dimensions do not match."); + + Resize(a.GetNumRows(), a.GetNumCols()); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignElementProductOf<<>>(m_pArray,a.m_pArray,b.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::ElementDivideBy(const GPUMatrix& a) + { + return AssignElementDivisionOf(*this, a); + } + + template + GPUMatrix& GPUMatrix::AssignElementDivisionOf (const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AssignElementDivisionOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + throw std::invalid_argument("The input matrix dimensions do not match."); + + Resize(a.GetNumRows(), a.GetNumCols()); + LONG64 N=(LONG64)GetNumElements(); + int blocksPerGrid =(int)ceil(((double)N)/threadsPerBlock); + a.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignElementDivisionOf<<>>(m_pArray,a.m_pArray,b.m_pArray,N); + if (do_sync) 
CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const + { + return AreEqual(*this, a, threshold); + } + + template + void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorNorm1: Matrix is empty."); + + const long n = (long)GetNumRows(); + const long m = (long)GetNumCols(); + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + cudaEvent_t done = nullptr; + PrepareDevice(); + c.ChangeDeviceTo(GetComputeDeviceId()); + + int blocksPerGrid=0; + if (isColWise) //col-wise + { + c.Resize(1,m); + blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); + } + else + { + c.Resize(n, 1); + c.ChangeDeviceTo(GetComputeDeviceId()); + blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); + } + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _vectorNorm1<<>>(c.m_pArray, m_pArray,n,m,isColWise); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) + { + a.VectorNorm1(*this, isColWise); + return *this; + } + + template + void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorNorm2: Matrix is empty."); + + const long n = (long)GetNumRows(); + const long m = (long)GetNumCols(); + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + cudaEvent_t done = nullptr; + PrepareDevice(); + c.ChangeDeviceTo(GetComputeDeviceId()); + + int blocksPerGrid=0; + if (isColWise) //col-wise + { + c.Resize(1,m); + blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); + } + else + { + c.Resize(n, 1); + c.ChangeDeviceTo(GetComputeDeviceId()); + blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); + } + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _vectorNorm2<<>>(c.m_pArray, m_pArray,n,m,isColWise); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) + { + a.VectorNorm2(*this, isColWise); + return *this; + } + + template + void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorMax: Matrix is empty."); + + //this implementation is not efficient + GPUMatrix tmp; + GPUMatrix tmp1; + tmp.AssignAbsOf((*this)); + tmp.VectorMax(tmp1,c,isColWise); + } + + template + GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) + { + a.VectorNormInf(*this, isColWise); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) + { + InnerProduct (a, b, *this,isColWise); + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AssignKhatriRaoProductOf: Matrix is empty."); + + long cols = a.GetNumCols(); + assert (cols == b.GetNumCols()); + if (!(cols == b.GetNumCols())) + throw std::invalid_argument("AssignKhatriRaoProductOf: The input matrix dimensions do not match."); + + long rowsA = 
(long)a.GetNumRows();
+        long rowsB = (long)b.GetNumRows();
+        Resize(rowsA * rowsB, cols);
+        float N=(float)GetNumElements();
+        int blocksPerGrid =(int)ceil(N/threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
+    //column-wise reshaped product. Used to compute the KhatriRaoProduct gradient.
+    // this = reshape each column of a from (K1xK2,1) to (K1, K2)
+    // if each column of a is not transposed, each (K1, K2) block multiplies the matching column of b (K2, frames),
+    // and the output is a (K1, frames) matrix
+    // if each column of a is transposed, each (K1, K2)^T block multiplies the matching column of b (K1, frames), and the output is (K2, frames)
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddColumnReshapeProductOf(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const bool transposeAColumn)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            throw std::logic_error("AddColumnReshapeProductOf: Matrix is empty.");
+
+        long cols = a.GetNumCols();
+        assert (cols == b.GetNumCols());
+        if (!(cols == b.GetNumCols()))
+            throw std::invalid_argument("AddColumnReshapeProductOf: The input matrix dimensions do not match.");
+
+        long rowsA = (long)a.GetNumRows();
+        long rowsB = (long)b.GetNumRows();
+        if (rowsA % rowsB != 0)
+            throw std::invalid_argument("AddColumnReshapeProductOf: the number of rows in a must be a multiple of that in b.");
+
+        long rowsC = rowsA / rowsB;
+        if (rowsC != GetNumRows() || cols != GetNumCols())
+            throw std::invalid_argument("AddColumnReshapeProductOf: This matrix does not have the right size.");
+
+        float N=(float)GetNumElements();
+        int blocksPerGrid =(int)ceil(N/threadsPerBlock);
+        a.PrepareDevice();
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddWithScaleOf(ElemType alpha, const GPUMatrix<ElemType>& a)
+    {
+        ScaleAndAdd(alpha, a, *this);
+        return *this;
+    }
+
+    template<class ElemType>
+    ElemType GPUMatrix<ElemType>::FrobeniusNorm() const
+    {
+        if (IsEmpty())
+            throw std::logic_error("FrobeniusNorm: Matrix is empty.");
+
+        PrepareDevice();
+        ElemType* d_sum = NULL;
+        ElemType h_sum=0;
+        CUDA_CALL(cudaMalloc((void**)&d_sum,sizeof(ElemType)));
+        //WARNING: THIS kernel is not the most efficient way!
+        _reductionSum2<ElemType><<<1,1024,0,t_stream>>>(m_pArray,d_sum,(LONG64)GetNumElements(), true);
+        CUDA_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(ElemType),cudaMemcpyDeviceToHost));
+        CUDA_CALL(cudaFree(d_sum));
+
+        return (h_sum);
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignFrobeniusNormOf (const GPUMatrix<ElemType>& a)
+    {
+        if (a.IsEmpty())
+            throw std::logic_error("AssignFrobeniusNormOf: Matrix a is empty.");
+
+        Resize(1,1);
+
+        PrepareDevice();
+        //WARNING: THIS kernel is not the most efficient way!
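+        // As a rough sketch of what such a single-block reduction does (illustrative only --
+        // the real _reductionSum2 lives in GPUMatrixCUDAKernels.cu, and the trailing 'true'
+        // argument is assumed here to request a final square root):
+        //
+        //   template<class ElemType>
+        //   __global__ void _reductionSum2(const ElemType* data, ElemType* sum, LONG64 N, bool takeSqrt)
+        //   {
+        //       __shared__ ElemType partials[1024];
+        //       ElemType s = 0;
+        //       for (LONG64 i = threadIdx.x; i < N; i += blockDim.x)   // grid-stride load of squares
+        //           s += data[i] * data[i];
+        //       partials[threadIdx.x] = s;
+        //       __syncthreads();
+        //       for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)  // tree reduction; blockDim.x == 1024
+        //       {
+        //           if (threadIdx.x < stride)
+        //               partials[threadIdx.x] += partials[threadIdx.x + stride];
+        //           __syncthreads();
+        //       }
+        //       if (threadIdx.x == 0)
+        //           sum[0] = takeSqrt ? sqrt(partials[0]) : partials[0];
+        //   }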
+ _reductionSum2<<<1,1024,0,t_stream>>>(a.m_pArray,m_pArray,(LONG64)a.GetNumElements(), true); + + return *this; + } + + template + ElemType GPUMatrix::MatrixNormInf() const + { + if (IsEmpty()) + throw std::logic_error("MatrixNorm1: Matrix is empty."); + + PrepareDevice(); + ElemType* d_maxAbs = NULL; + ElemType h_maxAbs=0; + CUDA_CALL(cudaMalloc((void**)&d_maxAbs,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! + _reductionMatrixNormInf<<<1,1024,0,t_stream>>>(m_pArray,d_maxAbs,(LONG64)GetNumElements()); + CUDA_CALL(cudaMemcpy(&h_maxAbs,d_maxAbs,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_maxAbs)); + return h_maxAbs; + } + + template + ElemType GPUMatrix::MatrixNorm1() const + { + if (IsEmpty()) + throw std::logic_error("MatrixNorm1: Matrix is empty."); + return SumOfAbsElements(); + } + + template + ElemType GPUMatrix::MatrixNorm0() const + { + if (IsEmpty()) + throw std::logic_error("MatrixNorm0: Matrix is empty."); + + PrepareDevice(); + ElemType* d_nz = NULL; + ElemType h_nz=0; + CUDA_CALL(cudaMalloc((void**)&d_nz,sizeof(ElemType))); + //WARNING: THIS kernel is not the most efficient way! + _reductionMatrixNorm0<<<1,1024,0,t_stream>>>(m_pArray,d_nz,(LONG64)GetNumElements()); + CUDA_CALL(cudaMemcpy(&h_nz,d_nz,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_nz)); + return h_nz; + } + + template + GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AssignSignOf: Matrix a is empty."); + + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + PrepareDevice(); + cudaEvent_t done = nullptr; + int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignSignOf<<>>(m_pArray, a.m_pArray, (long)GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) + { + if (a.IsEmpty()) + throw std::logic_error("AddSignOf: Matrix a is empty."); + + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + PrepareDevice(); + cudaEvent_t done = nullptr; + int blocksPerGrid=(int)ceil(1.0*GetNumElements()/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addSignOf<<>>(m_pArray, a.m_pArray, (LONG64)GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + + template + void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorMax: Matrix is empty."); + + const GPUMatrix& us=*this; + const long m = (long)GetNumRows(); + const long n = (long)GetNumCols(); + assert (m>0 && n>0); //converting from size_t to int may cause overflow + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + if (isColWise) + { + maxValues.Resize(1, n); + maxIndexes.Resize(1, n); + + int blocksPerGrid = n; //we'll have 1 block processing 1 column + _vectorMaxMinReduce<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,true); + + /*int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise);*/ + } + else + { + maxValues.Resize(m, 1); + maxIndexes.Resize(m, 1); + int 
blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); + _vectorMax<<>>(us.m_pArray,maxIndexes.m_pArray,maxValues.m_pArray,m,n,isColWise); + } + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const + { + if (IsEmpty()) + throw std::logic_error("VectorMax: Matrix is empty."); + + const GPUMatrix& us=*this; + const int m = (int)GetNumRows(); + const int n = (int)GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + if (isColWise) + { + minValues.Resize(1, n); + minIndexes.Resize(1, n); + + int blocksPerGrid = n; //we'll have 1 block processing 1 column + _vectorMaxMinReduce<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,false); + + /* + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise);*/ + } + else + { + minValues.Resize(m, 1); + minIndexes.Resize(m, 1); + int blocksPerGrid=(int)ceil(1.0*m/threadsPerBlock); + _vectorMin<<>>(us.m_pArray,minIndexes.m_pArray,minValues.m_pArray,m,n,isColWise); + } + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) + throw std::invalid_argument ("AssignNumOfDiff: a and b must have same dimension."); + + Resize(1,1); //result should be one element + + PrepareDevice(); + cudaEvent_t done = nullptr; + //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + //_assignNumOfDiff<<>>(a.m_pArray, b.m_pArray, m_pArray, a.GetNumElements()); + _assignNumOfDiff<<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, m_pArray, (LONG64)a.GetNumElements()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + return *this; + } + +#pragma endregion Member BLAS Functions + +#pragma region Other helper functions + template + void GPUMatrix::Print(const char* /*matrixName*/, size_t /*rowStart*/, size_t /*rowEnd*/, size_t /*colStart*/, size_t /*colEnd*/) const + { + NOT_IMPLEMENTED; + } + + template + void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const + { + Print(matrixName, 0, GetNumRows()-1, 0, GetNumCols()-1); + } + + // file I/O + //matrixName is used to verify that correct matrix is read. + template + void GPUMatrix::ReadFromFile(FILE*, const char * /*matrixName*/) + { + NOT_IMPLEMENTED; + } + + //matrixName is used to verify that correct matrix is read. 
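+    // Both ReadFromFile and WriteToFile are currently NOT_IMPLEMENTED. If one were to fill
+    // them in, a plausible host-staged sketch could look like the following (hypothetical --
+    // the helper name, header layout, and use of std::vector are illustrative, not the
+    // project's actual on-disk format):
+    //
+    //   template<class ElemType>
+    //   void WriteMatrixSketch(FILE* f, const ElemType* d_data, size_t rows, size_t cols, const char* matrixName)
+    //   {
+    //       std::vector<ElemType> h(rows * cols);
+    //       CUDA_CALL(cudaMemcpy(h.data(), d_data, h.size() * sizeof(ElemType), cudaMemcpyDeviceToHost));
+    //       fprintf(f, "%s %zu %zu\n", matrixName, rows, cols);  // name written so the reader can verify it
+    //       fwrite(h.data(), sizeof(ElemType), h.size(), f);
+    //   }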
+    template<class ElemType>
+    void GPUMatrix<ElemType>::WriteToFile(FILE*, const char * /*matrixName*/)
+    {
+        NOT_IMPLEMENTED;
+    }
+
+    //helper function used for convolutional neural networks
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignPackedConvolutionInput(const GPUMatrix<ElemType>& inputSubBatch,
+        const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
+        const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
+        const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
+        const bool zeroPadding)
+    {
+        assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth);
+
+        size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
+        size_t packedInputColsPerSample = outputWidth * outputHeight;
+        size_t smallBatchSize = inputSubBatch.GetNumCols();
+        Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
+        if (zeroPadding)
+            SetValue((ElemType)0);
+
+        PrepareDevice();
+        int numThreadPerBlock = threadsPerBlock;
+#if 1
+        int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock;
+#else
+        dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize);
+#endif
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _assignPackedConvolutionInput<ElemType><<<blocksPerGrid,numThreadPerBlock,0,t_stream>>>(m_pArray,
+            inputSubBatch.m_pArray,
+            smallBatchSize,
+            inputWidth, inputHeight, inputChannels,
+            outputWidth, outputHeight, outputChannels,
+            kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return *this;
+    }
+
+    //helper function used for convolutional neural networks
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::UnpackConvolutionInput(GPUMatrix<ElemType>& inputSubBatch,
+        const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
+        const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
+        const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
+        const bool zeroPadding) const
+    {
+        assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth);
+
+        size_t smallBatchSize = inputSubBatch.GetNumCols();
+
+        PrepareDevice();
+        int numThreadPerBlock = threadsPerBlock;
+#if 1
+        int blocksPerGrid = (smallBatchSize * inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock;
+#else
+        dim3 blocksPerGrid((inputWidth*inputHeight*inputChannels + numThreadPerBlock - 1)/numThreadPerBlock, smallBatchSize);
+#endif
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        _unpackConvolutionInput<ElemType><<<blocksPerGrid,numThreadPerBlock,0,t_stream>>>(m_pArray,
+            inputSubBatch.m_pArray,
+            smallBatchSize,
+            inputWidth, inputHeight, inputChannels,
+            outputWidth, outputHeight, outputChannels,
+            kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding);
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
+
+        return inputSubBatch;
+    }
+
+    template<class ElemType>
+    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignMaxPoolingResult(const GPUMatrix<ElemType>& inputBatch, const size_t channels,
+        const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
+        const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
+        const
size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + unsigned int batchSize = inputBatch.GetNumCols(); + Resize(outputSizePerSample, batchSize); + + int numThreadPerBlock = threadsPerBlock; + int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignMaxPoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + unsigned int batchSize = outputGradientBatch.GetNumCols(); + int numThreadPerBlock = threadsPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + + int blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + _addMaxPoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, inputBatch.m_pArray, outputBatch.m_pArray, batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + unsigned int batchSize = inputBatch.GetNumCols(); + Resize(outputSizePerSample, batchSize); + + int numThreadPerBlock = threadsPerBlock; + int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignAveragePoolingResult<<>>(m_pArray, inputBatch.m_pArray, batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) 
CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + + template + GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + assert (verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); + + size_t batchSize = outputGradientBatch.GetNumCols(); + int numThreadPerBlock = threadsPerBlock; + + PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + + size_t blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1)/numThreadPerBlock; + _addAveragePoolingGradient<<>>(m_pArray, outputGradientBatch.m_pArray, (long)batchSize, channels, + inputWidth, inputHeight,inputSizePerSample, + outputWidth, outputHeight, outputSizePerSample, + windowWidth, windowHeight, horizontalSubsample, verticalSubsample); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + return *this; + } + +#pragma endregion Other helper functions + +#pragma region Static BLAS Functions + template + void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, + ElemType beta, GPUMatrix& c) + { + a.PrepareDevice(); + if ((a.GetComputeDeviceId()!=b.GetComputeDeviceId()) || (b.GetComputeDeviceId()!=c.GetComputeDeviceId())) //different GPUs + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + cublasHandle_t cuHandle = GetCublasHandle(b.GetComputeDeviceId()); + cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; + int m = int(transposeA ? a.m_numCols : a.m_numRows); + int n = int(transposeB ? b.m_numRows : b.m_numCols); + int k = int(transposeA ? a.m_numRows : a.m_numCols); + int l = int(transposeB ? 
b.m_numCols : b.m_numRows); + c.Resize(m,n); + + if (!(m>0 && k>0 && l>0 && n>0)) + { + throw std::runtime_error("!(m>0 && k>0 && l>0 && n>0)"); //converting from size_t to int may cause overflow + } + if (k!=l) + { + throw std::runtime_error("matrix dim mismatch in MultiplyAndWeightedAdd"); + } + if (sizeof(ElemType)==sizeof(float)) + { + CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); + } + else if (sizeof(ElemType)==sizeof(double)) + { + CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast(&alpha),reinterpret_cast(a.m_pArray),(int)a.m_numRows,reinterpret_cast(b.m_pArray),(int)b.m_numRows,reinterpret_cast(&beta),reinterpret_cast(c.m_pArray),(int)c.m_numRows)); + } + else + { + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + c.m_numRows=m; + c.m_numCols=n; + } + } + + template + void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + { + return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 1, c); + } + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + { + return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 0, c); + } + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); + } + + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c + /// if a is a column vector, add to all columns of c + /// if a is a row vector, add to all rows of c + /// if a is a scalar, add to all elements of c + /// Scalar + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, GPUMatrix& c) + { + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + if (a.IsEmpty() || c.IsEmpty()) + throw std::logic_error("ScaleAndAdd: one of the input matrices is empty."); + //if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector + if (a.GetNumRows()==c.GetNumRows() && a.GetNumCols()==c.GetNumCols()) // dimensions match + { + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int len = m * n; + const int incx = 1; + const int incy = 1; + + assert (m>0 && n>0 && len>0); //converting from size_t to int may cause overflow + assert ((int)c.GetNumRows() == m && (int)c.GetNumCols() == n); + if ((int)c.GetNumRows() != m || (int)c.GetNumCols() != n) + throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); + + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + if (sizeof(ElemType) == sizeof(float)) + { + CUBLAS_CALL(cublasSaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); + } + else if (sizeof(ElemType) == sizeof(double)) + { + CUBLAS_CALL(cublasDaxpy(cuHandle,len,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),incx,reinterpret_cast (c.m_pArray) ,incy)); + } + else + { + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + } + else if 
(a.GetNumElements() == 1) + { + LONG64 N=(LONG64)c.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + c.PrepareDevice(); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _scaleAndAddScalar<<>>(c.m_pArray, N, alpha, a.m_pArray); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else if (a.GetNumCols() == 1) //col vector, add it to all columns + { + long m = (long)c.GetNumRows(); + long n = (long)c.GetNumCols(); + if (m != (long)a.GetNumRows()) + throw std::invalid_argument("To add column vector, rows should match."); + + cudaEvent_t done = nullptr; + int blocksPerGrid = (int)(ceil(1.0*m*n / threadsPerBlock)); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); +#ifdef VALIDATION + printf(">>>> CUDA compute device is %d\n", a.GetComputeDeviceId()); + printf(">>>> a.m_pArray = %p, c.m_pArray = %p, alpha = %f, m = %ld, n = %ld\n", a.m_pArray,c.m_pArray,alpha,m,n); + for (int i=0; i < 2; i++) + { + ElemType buffer[10] = {-1.234f}; + cudaError_t error = cudaMemcpy(buffer, !i?a.m_pArray:c.m_pArray, sizeof(buffer), cudaMemcpyKind::cudaMemcpyDeviceToHost); + if (error == cudaError::cudaSuccess) + printf("buffer valid\n"); + } +#endif + + _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.m_pArray,c.m_pArray,alpha,m,n); + + + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + else if (a.GetNumRows()==1) //row vector, add it to all rows + { + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + int m = (int)c.GetNumRows(); + int n = (int)c.GetNumCols(); + assert (n == (int)a.GetNumCols()); + if (n != (int)a.GetNumCols()) + throw std::invalid_argument("To add row vector, cols should match."); + + if (sizeof(ElemType) == sizeof(double)) + { + foreach_row(i,c) + { + CUBLAS_CALL(cublasDaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); + } + } + else + { + foreach_row(i,c) + { + CUBLAS_CALL(cublasSaxpy(cuHandle,n,reinterpret_cast (&alpha),reinterpret_cast (a.m_pArray),1,reinterpret_cast (c.m_pArray+i),m)); + } + } + } + else + throw std::invalid_argument("Dimention of matrix c does not match dimention of matrix a."); + } + } + + /// c += alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) + { + throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AddScaledDifference: Input matrix a is empty."); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + 
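+            // Presumably _addScaledDifference is a plain one-thread-per-element kernel in
+            // GPUMatrixCUDAKernels.cu along these lines (an illustrative sketch that matches
+            // the argument order of the launch below, not the verified kernel body):
+            //
+            //   template<class ElemType>
+            //   __global__ void _addScaledDifference(ElemType alpha, ElemType* a, ElemType* b, ElemType* c, LONG64 N)
+            //   {
+            //       LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
+            //       if (id >= N)
+            //           return;
+            //       c[id] += alpha * (a[id] - b[id]);   // fused scale-and-subtract, one element per thread
+            //   }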
_addScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + { + throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); + + if (&c != &a && &c != &b) + c.Resize(a.GetNumRows(), a.GetNumCols()); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignScaledDifference<<>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + /// c += alpha * (a-b) + /// if a, b, c must have same dim + /// 1X1 matrix + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + assert(alpha.GetNumElements() == 1); + if (!(alpha.GetNumElements() == 1)) + throw std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); + + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) + { + throw std::invalid_argument("AddScaledDifference: a, b, and c must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AddScaledDifference: Input matrix a is empty."); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + { + assert(alpha.GetNumElements() == 1); + if (!(alpha.GetNumElements() == 1)) + throw 
std::invalid_argument("AddScaledDifference: alpha must be a 1X1 matrix."); + + if (a.GetComputeDeviceId()!=c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + a.PrepareDevice(); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + { + throw std::invalid_argument("AssignScaledDifference: a, b must have same dimension."); + } + + if (a.IsEmpty()) + throw std::logic_error("AssignScaledDifference: Input matrix a is empty."); + + c.Resize(a.GetNumRows(), a.GetNumCols()); + + cudaEvent_t done = nullptr; + LONG64 n=(LONG64)a.GetNumElements(); + int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignScaledDifference<<>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + //c[ci,cj] += a[ai,aj] + template + void GPUMatrix::AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) + { + if (ai >= a.GetNumRows() || aj >=a.GetNumCols() || + ci >= c.GetNumRows() || cj >=c.GetNumCols()) + throw std::invalid_argument("AddElementToElement: index out of range."); + + a.PrepareDevice(); + cudaEvent_t done = nullptr; + int blocksPerGrid=1; //only one element + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _addElementToElement<<>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj)); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + + template + void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) + { + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + if (sizeof(ElemType)==sizeof(float)) + { + float alph = (float)alpha; + CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(float*)a.m_pArray,1)); + } + else if (sizeof(ElemType)==sizeof(double)) + { + double alph = alpha; + CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),&alph,(double*)a.m_pArray,1)); + } + else + { + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + } + + + template + void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) + { + if (alpha.GetNumElements()!=1) + { + throw std::runtime_error("Matrix alpha must be 1x1"); + } + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + if (sizeof(ElemType)==sizeof(float)) + { + CUBLAS_CALL(cublasSscal(cuHandle,int(a.m_numRows*a.m_numCols),(float*)alpha.m_pArray,(float*)a.m_pArray,1)); + } + else if (sizeof(ElemType)==sizeof(double)) + { + CUBLAS_CALL(cublasDscal(cuHandle,int(a.m_numRows*a.m_numCols),(double*)alpha.m_pArray,(double*)a.m_pArray,1)); + } + else + { + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + throw std::runtime_error("Unsupported template argument in GPUMatrix"); + } + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + } + + template //c = alpha * a + void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + { + if (a.IsEmpty()) + throw std::logic_error("Scale: Input matrix a is empty."); + + c=a; + Scale(alpha,c); + } + + + template + void GPUMatrix::InnerProduct (const GPUMatrix& a, const 
GPUMatrix& b, GPUMatrix& c, const bool isColWise) + { + if (a.GetComputeDeviceId()!=b.GetComputeDeviceId() || b.GetComputeDeviceId()!=c.GetComputeDeviceId()) //different GPUs + throw std::invalid_argument("All matrices must be on the same GPU"); + + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("Scale: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + throw std::invalid_argument("Matrices a and b should have same dimension."); + + if (isColWise) + c.Resize(1,n); + else + c.Resize(m,1); + + if ((isColWise && m == 1) || !isColWise && n == 1) //in this case it's equivalent to element-wise product + { + c.AssignElementProductOf(a, b); + } + else + { + cudaEvent_t done = nullptr; + c.PrepareDevice(); + + int blocksPerGrid=0; + if (isColWise) //col-wise + { + c.Resize(1,n); + blocksPerGrid =(int)ceil(1.0*n/threadsPerBlock); + } + else + { + c.Resize(m, 1); + blocksPerGrid =(int)ceil(1.0*m/threadsPerBlock); + } + + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _innerProduct<<>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + template + ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); + + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + if (sizeof(ElemType) == sizeof(double)) + { + double tmp=0; + CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); + return ElemType(tmp); + //return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); + } + else + { + float tmp=0; + CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,&tmp)); + return tmp; + //return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1); + } + } + + + template + GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("InnerProductOfMatrices: one of the input matrices is empty."); + + Resize(1,1); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have 
same dimension."); + + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + if (sizeof(ElemType) == sizeof(double)) + { + CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); + } + else + { + CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (b.m_pArray), 1,reinterpret_cast (m_pArray))); + } + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + return *this; + } + + + template + void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + { + if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) + { + throw std::invalid_argument("All matrices must be on the same GPU"); + } + else + { + if (a.IsEmpty()) + throw std::logic_error("ElementWisePower: The input matrix a is empty."); + if (a.GetNumRows()!=c.GetNumRows() || a.GetNumCols()!=c.GetNumCols()) + throw std::logic_error("ElementWisePower: matrices must be of the same size"); + + cudaEvent_t done = nullptr; + a.PrepareDevice(); + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + LONG64 N=(LONG64)a.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + _elementWisePowerOnCuda<<>>(alpha,a.m_pArray,c.m_pArray,N); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } + } + + template + bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) + { + if (a.IsEmpty() || b.IsEmpty()) + throw std::logic_error("AreEqual: one of the input matrices is empty."); + + if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) + return false; + + a.PrepareDevice(); + long *res = new long[1]; + res[0]=1; + long *d_res = NULL; + CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(long)*1)); + CUDA_CALL(cudaMemcpy(d_res,res,sizeof(long)*1,cudaMemcpyHostToDevice)); + long N=(long)a.GetNumElements(); + int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); + _areEqual<<>>(a.m_pArray,b.m_pArray,N,threshold,d_res); + CUDA_CALL(cudaMemcpy(res,d_res,sizeof(long)*1,cudaMemcpyDeviceToHost)); + if (res[0]!=0) + return true; + else + return false; + } + + template + GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols) + { + GPUMatrix c(rows, cols); //will initialize to 0 + c.SetValue(1); + return c; + } + + template + GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols) + { + GPUMatrix c(rows, cols); //will initialize to 0 + //c.SetValue(0); + return c; + } + + template + GPUMatrix GPUMatrix::Eye(const size_t rows) + { + GPUMatrix c(rows, rows); //will initialize to 0 + c.SetDiagonalValue(1); + return c; + } + + template + GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) + { + GPUMatrix c(rows, cols); //will initialize to 0 + c.SetUniformRandomValue(low, high, seed); + return c; + } + + template + GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) + { + GPUMatrix c(rows, cols); //will initialize to 0 + c.SetGaussianRandomValue(mean, sigma, seed); + return c; + } + + template + ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix &Gradients, const GPUMatrix &SmoothedGradients) + { + Gradients.PrepareDevice(); + ElemType* d_res=NULL; + 
CUDA_CALL(cudaMalloc((void**)&d_res,sizeof(ElemType))); //we allocate memory on the device + + //Compute inner product of matrices and keep it on device + const int m = (int)Gradients.GetNumRows(); + const int n = (int)Gradients.GetNumCols(); + const int k = (int)SmoothedGradients.GetNumRows(); + const int l = (int)SmoothedGradients.GetNumCols(); + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) throw std::invalid_argument("InnerProductOfMatrices: Matrices a and b should have same dimension."); + + if (sizeof(ElemType) == sizeof(double)) + { + cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + CUBLAS_CALL(cublasDdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + } + else + { + cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); + CUBLAS_CALL(cublasSdot(cuHandle,m*n, reinterpret_cast (Gradients.m_pArray), 1, reinterpret_cast (SmoothedGradients.m_pArray), 1,reinterpret_cast (d_res))); + cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); + } + // d_res[0] should now contain inner product of matrices + // Compute squared Frobenius norms (squared sums of elements) + _lrHelper<<<1,512,0,t_stream>>>(Gradients.m_pArray,SmoothedGradients.m_pArray, (LONG64)Gradients.GetNumElements(), d_res); + ElemType res; + CUDA_CALL(cudaMemcpy(&res,d_res,sizeof(ElemType),cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaFree(d_res)); + return res; + } + +#pragma endregion Static BLAS Functions + + + template class GPUMatrix; + template class GPUMatrix; + template class DeviceBoundNumber; + template class DeviceBoundNumber; + + template + cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus]={0}; + + template + void* GPUMatrix::s_curandGenerator=NULL; +}}} + +// !!!!This is from helper_cuda.h which comes with CUDA samples!!!! 
Consider if it is beneficial to just include all helper_cuda.h +// TODO: This is duplicated in BestGpu.cpp +// Beginning of GPU Architecture definitions +int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { + { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + { -1, -1 } + }; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + return nGpuArchCoresPerSM[7].Cores; +}; +// end of GPU Architecture definitions + +//inline long _GetFreeMemoryOnCUDADevice(int devId) +//{ +// CUdevice cudaDevice; +// CUresult result = cuDeviceGet(&cudaDevice, devId); +// if(result!= CUDA_SUCCESS) +// { +// return 0; +// } +// +// //create cuda context +// CUcontext cudaContext; +// result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice); +// if(result != CUDA_SUCCESS) +// { +// return 0; +// } +// +// //get the amount of free memory on the graphics card +// size_t free; +// size_t total; +// result = cuMemGetInfo(&free, &total); +// if (result!=CUDA_SUCCESS) +// { +// return 0; +// } +// else +// return (long)free; +//} + +#endif // CPUONLY diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index c7e817239..2115bf205 100644 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -1,3266 +1,3405 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - -#include "BestGpu.h" - -#ifndef CPUONLY - -#include -#include -#include "CommonMatrix.h" -#include "device_functions.h" - - -#ifndef LONG64 //we would like to use 64-bit long to support large matrices. However, CUDA seems to support only 32-bit long -#define LONG64 long -#endif - -#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing -#define threadsPerBlock 512 - -// Predefine this for later. 
-static __inline__ __device__ double atomicAdd(double* address, double val); -//CUDA Kernels code -template -__global__ void _elementWisePowerOnCuda( - ElemType alpha, - const ElemType *a, - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (alpha==0) - { - c[id]=1; - } - else if (alpha==1) - { - c[id]=a[id]; - } - else if (alpha==2) - { - c[id]=a[id]*a[id]; - } - else if (alpha==3) - { - c[id]=a[id]*a[id]*a[id]; - } - else - { - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=pow(a[id],alpha); - } - else - { - c[id]=powf(a[id],alpha); - } - } -}; - -template -__global__ void _inplaceSigmoidOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - if (c[id]>=0) - { - double e = exp(-1*c[id]); - c[id]=1/(1+e); - } - else - { - double e = exp(c[id]); - c[id]=e/(1+e); - } - } - else - { - if (c[id]>=0) - { - float e = expf(-1*c[id]); - c[id]=1/(1+e); - } - else - { - float e = exp(c[id]); - c[id]=e/(1+e); - } - } -}; - -template -__global__ void _assignSigmoidOf( - const ElemType* a, - ElemType* res, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - if (a[id]>=0) - { - double e = exp(-1*a[id]); - res[id]=1/(1+e); - } - else - { - double e = exp(a[id]); - res[id]=e/(1+e); - } - } - else - { - if (a[id]>=0) - { - float e = expf(-1*a[id]); - res[id]=1/(1+e); - } - else - { - float e = exp(a[id]); - res[id]=e/(1+e); - } - } -}; - -template -__global__ void _inplaceLinRectDerivative( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (c[id]<=0) - c[id]=0; - else - c[id]=1; -} - -template -__global__ void _assignSigmoidDerivative( - ElemType *a, - ElemType *c, - LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - c[id] = a[id] * (1-a[id]); -} - -template -__global__ void _inplaceTanhOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=tanh(c[id]); - } - else - { - c[id]=tanhf(c[id]); - } - -}; - -//to prevent negative values caused by floating operations, we force inputs to be >=0 -//this may, however, hide problems in the caller. 
-template -__global__ void _inplaceSqrtOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=sqrt(max((ElemType)0, c[id])); - } - else - { - c[id]=sqrtf(max(ElemType(0), c[id])); - } -}; - -template -__global__ void _inplaceExpOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=exp(c[id]); - } - else - { - c[id]=expf(c[id]); - } -}; - -template -__global__ void _inplaceLogOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (c[id] -__global__ void _inplaceAbsOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=fabs(c[id]); - } - else - { - c[id]=fabsf(c[id]); - } -}; - -template -__global__ void _inplaceCosineOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=cos(c[id]); - } - else - { - c[id]=cosf(c[id]); - } -}; - -template -__global__ void _inplaceNegativeSineOnCuda( - ElemType* c, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - if (sizeof(ElemType)==sizeof(double)) - { - c[id]=-sin(c[id]); - } - else - { - c[id]=-sinf(c[id]); - } -}; - - -template -__global__ void _setValue( - ElemType* a, - const ElemType v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]=v; -}; - -template -__global__ void _setValue( - ElemType* a, - const ElemType* d_v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]=d_v[0]; -}; - -template -__global__ void _assignRowSliceValuesOf(ElemType * dest, ElemType * src, const LONG64 N, const long startIndex, const long destRows, const long srcRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - long col = id / destRows; - long row = id - (col * destRows); - - //dest[id] = src[col*srcRows + row + startIndex]; - dest[id] = src[IDX2C(row + startIndex, col, srcRows)]; -} - -template -__global__ void _addToRowSliceValuesOf(ElemType * dest, ElemType * src, const LONG64 N, const long startIndex, const long destRows, const long srcRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - long col = id / srcRows; //src is the full matrix, rowslice is taken from the dest - long row = id - (col * srcRows); - - //dest[col*destRows + row + startIndex] += src[id]; - dest[IDX2C(row + startIndex, col, destRows)] += src[id]; -} - -template -__global__ void _addWithRowSliceValuesOf(ElemType * dest, ElemType * src, const LONG64 N, const long startIndex, const long destRows, const long srcRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id >= N) - return; - - long col = id / destRows; //dest is the full matrix, rowslice is taken from the src - long row = id - (col * destRows); - - dest[id] += src[IDX2C(row + startIndex, col, srcRows)]; -} - -template -__global__ void _assignRepeatOf(ElemType * dest, ElemType * src, const LONG64 N, const long srcRows, const long srcCols, const long destRows) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id >= N) - return; - - long destCol = id / destRows; - 
long destRow = id - (destCol * destRows); - long srcRow = destRow % srcRows; - long srcCol = destCol % srcCols; - - dest[id] = src[IDX2C(srcRow,srcCol,srcRows)]; -} - -template -__global__ void _assignDifferenceOf1( - ElemType* us, - const ElemType alpha, - const ElemType* a, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - us[id]=alpha-a[id]; -}; - -template -__global__ void _assignDifferenceOf2( - ElemType* us, - const ElemType alpha, - const ElemType* a, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - us[id]=a[id]-alpha; -}; - -///a is a scalar -template -__global__ void _scaleAndAddScalar( - ElemType* c, - const LONG64 N, - const ElemType alpha, - const ElemType* a -) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - c[id] += alpha*a[0]; -}; - -template -__global__ void _addValue( - ElemType* a, - const ElemType v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]+=v; -}; - -template -__global__ void _addValue( - ElemType* a, - const ElemType* d_v, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]+=d_v[0]; -}; - - -template -__global__ void _elemMul( - ElemType* a, - const ElemType* b, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - a[id]*=b[id]; -}; - -template -__global__ void _assignElementProductOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const LONG64 N) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - us[id]=a[id]*b[id]; -} - -template -__global__ void _assignKhatriRaoProductOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const long rowsA, - const long rowsB, - const long cols) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - - const long rows = rowsA * rowsB; - const long col = id / rows; - if (col >= cols) - return; - - const long row = id % rows; - const long rowB = row / rowsA; - const long rowA = row % rowsA; - - us[id] = a[rowA + col * rowsA] * b[rowB + col * rowsB]; -} - -template -__global__ void _addColumnReshapeProductOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const long rowsB, - const long rowsC, - const long cols, - const bool transposeAColumn) -{ - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - - const long col = id / rowsC; - if (col >= cols) - return; - - const long row = id % rowsC; - long bBase = col * rowsB; - long aBase = bBase * rowsC; - ElemType v = 0; - - if (transposeAColumn) - { - aBase += row * rowsB; - for (long i=0; i -__global__ void _assignElementDivisionOf( - ElemType* us, - const ElemType* a, - const ElemType* b, - const LONG64 N) -{ - ElemType smallValue = EPS_IN_INVERSE; - - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - ElemType v = b[id]; - - if (v <0 && v > -smallValue) - us[id] = a[id]/(-smallValue); - else if (v >=0 && v < smallValue) - us[id] = a[id]/smallValue; - else - us[id]=a[id]/v; -} - -template -__global__ void _elemInverse( - ElemType* us, - const LONG64 N) -{ - ElemType smallValue = EPS_IN_INVERSE; - - LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; - if (id>=N) - return; - - if (us[id] <0 && us[id] > -smallValue) - us[id] = 1/-smallValue; - else if (us[id] >=0 && us[id] < smallValue) - us[id] = 1/smallValue; - else - us[id]=1/us[id]; -} - -template -__global__ void _logSoftMaxColWise( - ElemType *a, - const 
long m_numCols, - const long m_numRows) //ld -{ - int col_id = blockDim.x * blockIdx.x + threadIdx.x; - if (col_id>=m_numCols) - return; - - __shared__ ElemType maxV[threadsPerBlock]; - __shared__ ElemType Sum[threadsPerBlock]; - maxV[threadIdx.x]=a[IDX2C(0,col_id,m_numRows)]; - Sum[threadIdx.x]=0; - - for (long i=0;imaxV[threadIdx.x]) - { - maxV[threadIdx.x]=a[IDX2C(i,col_id,m_numRows)]; - } - } - - for (long i=0;i -//__global__ void _assignColumnwiseSoftmaxOf( -// const ElemType *a, -// ElemType* us, -// const long m_numCols, -// const long m_numRows) //thead per column -//{ -// int col_id = blockDim.x * blockIdx.x + threadIdx.x; -// if (col_id>=m_numCols) -// return; -// -// __shared__ ElemType maxV[threadsPerBlock]; -// __shared__ ElemType Sum[threadsPerBlock]; -// maxV[threadIdx.x]=a[IDX2C(0,col_id,m_numRows)]; -// Sum[threadIdx.x]=0; -// -// for (long i=0;imaxV[threadIdx.x]) -// { -// maxV[threadIdx.x]=a[IDX2C(i,col_id,m_numRows)]; -// } -// } -// -// for (long i=0;i -__global__ void _assignColumnwiseLogSoftmaxOf( - const ElemType *a, - ElemType* us, - const long m_numCols, - const long m_numRows) // each block processes one column. There must be 512 threads in a block -{ - //we first find max per column - __shared__ ElemType colMax[1]; - __shared__ ElemType partials[512]; - colMax[0]=-10000000; - partials[threadIdx.x]=-10000000; - - //int id = blockDim.x * blockIdx.x + threadIdx.x; - int loadPerThread = m_numRows/blockDim.x; - - for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i) - { - partials[threadIdx.x]=max(partials[threadIdx.x],a[IDX2C(i,blockIdx.x,m_numRows)]); - } - __syncthreads(); - - //256 - if (threadIdx.x<256) - { - partials[threadIdx.x]=max(partials[threadIdx.x+256],partials[threadIdx.x]); - } - __syncthreads(); - - //128 - if (threadIdx.x<128) - { - partials[threadIdx.x]=max(partials[threadIdx.x+128],partials[threadIdx.x]); - } - __syncthreads(); - - //64 - if (threadIdx.x<64) - { - partials[threadIdx.x]=max(partials[threadIdx.x+64],partials[threadIdx.x]); - } - __syncthreads(); - - //32 - if (threadIdx.x<32) - { - partials[threadIdx.x]=max(partials[threadIdx.x+32],partials[threadIdx.x]); - } - __syncthreads(); - - //16 - if (threadIdx.x<16) - { - partials[threadIdx.x]=max(partials[threadIdx.x+16],partials[threadIdx.x]); - } - __syncthreads(); - - //8 - if (threadIdx.x<8) - { - partials[threadIdx.x]=max(partials[threadIdx.x+8],partials[threadIdx.x]); - } - __syncthreads(); - - //4 - if (threadIdx.x<4) - { - partials[threadIdx.x]=max(partials[threadIdx.x+4],partials[threadIdx.x]); - } - __syncthreads(); - - if (threadIdx.x==0) - { - colMax[0] = max(max(partials[0],partials[1]),max(partials[2],partials[3])); - } - partials[threadIdx.x]=0.0f; - __syncthreads(); - //end of finding max - //now start finding sums - __shared__ ElemType colSum[1]; - colSum[0]=0.0f; - for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? 
-template<class ElemType>
-__global__ void _assignColumnwiseLogSoftmaxOf(
-    const ElemType *a,
-    ElemType* us,
-    const long m_numCols,
-    const long m_numRows) // each block processes one column. There must be 512 threads in a block
-{
-    //we first find max per column
-    __shared__ ElemType colMax[1];
-    __shared__ ElemType partials[512];
-    colMax[0]=-10000000;
-    partials[threadIdx.x]=-10000000;
-
-    //int id = blockDim.x * blockIdx.x + threadIdx.x;
-    int loadPerThread = m_numRows/blockDim.x;
-
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x],a[IDX2C(i,blockIdx.x,m_numRows)]);
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+256],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+128],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+64],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+32],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+16],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+8],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        partials[threadIdx.x]=max(partials[threadIdx.x+4],partials[threadIdx.x]);
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        colMax[0] = max(max(partials[0],partials[1]),max(partials[2],partials[3]));
-    }
-    partials[threadIdx.x]=0.0f;
-    __syncthreads();
-    //end of finding max
-    //now start finding sums
-    __shared__ ElemType colSum[1];
-    colSum[0]=0.0f;
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        ElemType tmp=a[IDX2C(i,blockIdx.x,m_numRows)]-colMax[0];
-        us[IDX2C(i,blockIdx.x,m_numRows)]=tmp;
-        partials[threadIdx.x]+=(sizeof(ElemType)==sizeof(float)?expf(tmp):exp(tmp));
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+256];
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+128];
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+64];
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+32];
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+16];
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+8];
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        partials[threadIdx.x]+=partials[threadIdx.x+4];
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        colSum[0] = partials[0]+partials[1]+partials[2]+partials[3];
-        colSum[0] = (sizeof(ElemType)==sizeof(float)?logf(colSum[0]):log(colSum[0]));
-    }
-    __syncthreads();
-    //end of finding sums
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m_numRows : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        us[IDX2C(i,blockIdx.x,m_numRows)]-=colSum[0];
-    }
-}
-
-template<class ElemType>
-__global__ void _logSoftMaxRowWise(
-    ElemType *a,
-    const long m_numCols,
-    const long m_numRows) //ld
-{
-    int row_id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (row_id>=m_numRows)
-        return;
-
-    __shared__ ElemType maxV[threadsPerBlock];
-    __shared__ ElemType Sum[threadsPerBlock];
-    maxV[threadIdx.x]=a[IDX2C(row_id,0,m_numRows)];
-    Sum[threadIdx.x]=0;
-
-    for (long j=0;j<m_numCols;++j)
-    {
-        if (a[IDX2C(row_id,j,m_numRows)]>maxV[threadIdx.x])
-        {
-            maxV[threadIdx.x]=a[IDX2C(row_id,j,m_numRows)];
-        }
-    }
-
-    for (long j=0;j<m_numCols;++j)
-    {
-        ElemType tmp=a[IDX2C(row_id,j,m_numRows)]-maxV[threadIdx.x];
-        Sum[threadIdx.x]+=(sizeof(ElemType)==sizeof(float)?expf(tmp):exp(tmp));
-    }
-    Sum[threadIdx.x]=maxV[threadIdx.x]+(sizeof(ElemType)==sizeof(float)?logf(Sum[threadIdx.x]):log(Sum[threadIdx.x]));
-    for (long j=0;j<m_numCols;++j)
-    {
-        a[IDX2C(row_id,j,m_numRows)]-=Sum[threadIdx.x];
-    }
-}
-
-template<class ElemType>
-__global__ void _inplaceTruncateBottom(
-    ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]<threshold)
-        a[id]=threshold;
-}
-
-template<class ElemType>
-__global__ void _assignTruncateBottom(
-    ElemType* us,
-    const ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]<threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
-}
-
-template<class ElemType>
-__global__ void _inplaceTruncateTop(
-    ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]>threshold)
-        a[id]=threshold;
-}
-
-template<class ElemType>
-__global__ void _assignTruncateTop(
-    ElemType* us,
-    const ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]>threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
-}
-
-template<class ElemType>
-__global__ void _setToZeroIfAbsLessThan(
-    ElemType* a,
-    const ElemType threshold,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (sizeof(ElemType)==sizeof(float))
-    {
-        if (fabsf(a[id])<threshold)
-            a[id]=0;
-    }
-    else
-    {
-        if (fabs(a[id])<threshold)
-            a[id]=0;
-    }
-}
-
-template<class ElemType>
-__global__ void _areEqual(
-    const ElemType* a,
-    const ElemType* b,
-    const LONG64 N,
-    const ElemType threshold,
-    long *d_res)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-
-    if (sizeof(ElemType)==sizeof(float))
-    {
-        if (fabsf(a[id]-b[id]) > threshold)
-        {
-            d_res[0]=0;
-        }
-    }
-    else
-    {
-        if (fabs(1.0*a[id]-1.0*b[id]) > threshold)
-        {
-            d_res[0]=0;
-        }
-    }
-
-}
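// Background sketch (illustrative, not part of this file): the unrolled
// 256/128/.../4 steps in _assignColumnwiseLogSoftmaxOf are a shared-memory tree
// reduction across the block's 512 threads, performed once with max() and once
// with +=. The same idea in loop form, assuming blockDim.x is a power of two:
//
//   for (int stride = blockDim.x / 2; stride >= 4; stride >>= 1)
//   {
//       if (threadIdx.x < stride)
//           partials[threadIdx.x] += partials[threadIdx.x + stride];
//       __syncthreads();
//   }
//   // thread 0 then folds the last four partials, as the kernels above do.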
-template<class ElemType>
-__global__ void _setDiagonalValue(
-    ElemType* a,
-    const ElemType v,
-    const unsigned long N,
-    const unsigned long ld)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[IDX2C(id,id,ld)]=v;
-
-}
-
-template<class ElemType>
-__global__ void _setDiagonalValueFromVector(
-    ElemType* a,
-    const ElemType* b,
-    const long N)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[IDX2C(id,id,N)]=b[id];
-}
-
-template<class ElemType>
-__global__ void _adagrad(
-    ElemType* a,
-    ElemType* d_v,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= N)
-        return;
-
-    const ElemType floor = 1e-16f;
-
-    a[id] += d_v[id] * d_v[id];
-    d_v[id] /= sqrt(a[id]+floor);
-}
-
-template<class ElemType>
-__global__ void _rmsprop_init(
-    ElemType* avars, ElemType* signs, ElemType* steps,
-    ElemType* curr_grad,
-    const LONG64 N
-    )
-{
-    LONG64 i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= N)
-        return;
-
-    ElemType tmp = curr_grad[i];
-    avars[i] = tmp * tmp;
-    signs[i] = ElemType(0.0);
-    steps[i] = ElemType(0.02);
-}
-
-template<class ElemType>
-__global__ void _rmsprop(
-    ElemType* avars, ElemType* signs, ElemType* steps,
-    ElemType* curr_grad,
-    const LONG64 N,
-    ElemType RMS_GAMMA,ElemType RMS_WGT_INC,ElemType RMS_WGT_MAX,ElemType RMS_WGT_DEC,ElemType RMS_WGT_MIN,
-    ElemType floor,
-    ElemType *upd_gpu
-    )
-{
-    LONG64 i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= N)
-        return;
-
-    avars[i] = RMS_GAMMA * avars[i] + (ElemType(1.0)-RMS_GAMMA)* (curr_grad[i] * curr_grad[i]);
-
-    //// grad sign base 3: 0->neg, 1->zero, 2->pos
-    //const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
-
-    //// signs[i] contains three consecutive grad_sign
-    //signs[i] = 3*(int(signs[i]) % 9) + grad_sign;
-
-    //// update according to the following table:
-    //// (!pos,!pos,!pos) or (!neg,!neg,!neg): RMS_WGT_INC
-    //// (!neg,!neg,neg) or (!pos,!pos,pos): RMS_WGT_DEC
-    //// otherwise: no action
-
-    //switch(int(upd_gpu[int(signs[i])]))
-    //{
-    //case 0:
-    //    steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
-    //    break;
-    //case 2:
-    //    steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
-    //    break;
-    //}
-    //curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
-
-    const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
-
-    if( signs[i] * grad_sign > 0 )
-        steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
-    else
-        steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
-
-    curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
-    signs[i] = grad_sign;
-
-}
-
-template<class ElemType>
-__global__ void _rescaleToRange(
-    ElemType* a,
-    const LONG64 N,
-    const ElemType low,
-    const ElemType high)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[id]=a[id]*(high-low)+low;
-}
-
-template<class ElemType>
-__global__ void _setMaskAndScale(
-    ElemType* a,
-    const LONG64 N,
-    const ElemType maskRate,
-    const ElemType scaleValue)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    a[id] = a[id]<=maskRate ? 0 : scaleValue;
-}
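// Background note (illustrative, not part of this file): _adagrad is the usual
// per-weight schedule -- accumulate the squared gradient, then normalize:
//
//   a[i] += g[i]*g[i];
//   g[i] /= sqrt(a[i] + 1e-16);   // the floor guards against division by zero
//
// _rmsprop keeps a leaky average avars = gamma*avars + (1-gamma)*g*g instead,
// and adapts the per-weight step: while the gradient sign stays stable the step
// is multiplied by RMS_WGT_INC, and on a sign flip by RMS_WGT_DEC, clamped to
// [RMS_WGT_MIN, RMS_WGT_MAX].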
-
-template<class ElemType>
-__global__ void _vectorNorm1(
-    ElemType* c, //output
-    const ElemType* a, //input
-    const long n, //a.numRows
-    const long m, //a.numCols
-    const bool isColWise)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=m)||(!isColWise && id>=n))
-        return;
-
-    ElemType sum = 0;
-
-    if (isColWise)
-    {
-        for (long i=0;i<n;++i)
-        {
-            sum += (sizeof(ElemType)==sizeof(float)?fabsf(a[IDX2C(i,id,n)]):fabs(a[IDX2C(i,id,n)]));
-        }
-    }
-    else
-    {
-        for (long j=0;j<m;++j)
-        {
-            sum += (sizeof(ElemType)==sizeof(float)?fabsf(a[IDX2C(id,j,n)]):fabs(a[IDX2C(id,j,n)]));
-        }
-    }
-    c[id]=sum;
-}
-
-template<class ElemType>
-__global__ void _vectorNorm2(
-    ElemType* c, //output
-    const ElemType* a, //input
-    const long N, //a.GetNumRows();
-    const long M, //a.GetNumCols();
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=M) || (!isColWise && id>=N))
-        return;
-
-    ElemType sum = 0;
-    if (isColWise)
-    {
-        for (long i=0;i<N;++i)
-        {
-            ElemType v = a[IDX2C(i,id,N)];
-            sum += v * v;
-        }
-    }
-    else
-    {
-        for (long j=0;j<M;++j)
-        {
-            ElemType v = a[IDX2C(id,j,N)];
-            sum += v * v;
-        }
-    }
-    c[id] = (sizeof(ElemType)==sizeof(float)?sqrtf(sum):sqrt(sum));
-}
-
-template<class ElemType>
-__global__ void _convertInd2ValsAdjustInd(
-    ElemType* inds,
-    const ElemType* M,
-    ElemType* vals,
-    const long n, //number of cols
-    const long m, //number of rows
-    const bool isColWise)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=n)||(!isColWise && id>=m))
-        return;
-    inds[id]--;
-    if (isColWise)
-    {
-        vals[id]=M[IDX2C((int)inds[id],id,m)];
-    }
-    else
-    {
-        vals[id]=M[IDX2C(id,(int)inds[id],m)];
-    }
-}
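// Background note (illustrative, not part of this file): these kernels address
// matrices through the column-major macro
//
//   IDX2C(i, j, ld) == ((i) + (j) * (ld))   // row i, column j, leading dimension ld
//
// which is why _vectorNorm1/_vectorNorm2 walk a column by stepping the row
// index with the column fixed, and walk a row by stepping the column index
// with the row fixed.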
-
-
-//assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11)
-template<class ElemType>
-__global__ void _assignPackedConvolutionInput(ElemType * packedMatrix, const ElemType * inputSubBatch, const long batchSize,
-                                              const long inputWidth, const long inputHeight, const long inputChannels,
-                                              const long outputWidth, const long outputHeight, const long outputChannels,
-                                              const long kernelWidth, const long kernelHeight, const long horizontalSubsample, const long verticalSubsample, const bool zeroPadding)
-{
-    const long inputHeightTimesChannel = inputHeight * inputChannels;
-    const size_t inputDim = inputWidth*inputHeightTimesChannel;
-
-    const long idall = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = idall / inputDim;
-    if (sample >= batchSize)
-        return;
-
-    const long id = idall % inputDim;
-    const long y = id / inputHeightTimesChannel; //inputCol
-
-    const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
-    const size_t packedInputColsPerSample = outputWidth * outputHeight; //output size per channel
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels)
-    // IN_ELEM_COLPOS = sample
-
-    const long nXC = id % inputHeightTimesChannel; //channel + inputRow*inputChannels
-    const long x = nXC / inputChannels; //inputRow
-    const long c = nXC % inputChannels; //channel
-
-    ElemType currentInputValue = inputSubBatch[id + sample*inputDim];
-
-    long x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    if (zeroPadding)
-    {
-        const long halfKernelWidth = kernelWidth/2;
-        const long halfKernelHeight = kernelHeight/2;
-
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1.0f+halfKernelHeight)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x+halfKernelHeight-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1.0f+halfKernelWidth)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y+halfKernelWidth-y0*horizontalSubsample; //first posyInKernel
-    }
-    else
-    {
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y-y0*horizontalSubsample; //first posyInKernel
-    }
-
-    // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight)
-    // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow)
-
-    long packColBase = sample*packedInputColsPerSample + y0*outputHeight;
-    for (long wcol = y0, posyInKernel = y1; wcol < outputWidth && posyInKernel>=0; wcol++, posyInKernel -= horizontalSubsample)
-    {
-        long packRowBase = c * kernelWidth * kernelHeight + posyInKernel * kernelHeight;
-        for (long wrow = x0, posxInKernel = x1; wrow < outputHeight && posxInKernel>=0; wrow++, posxInKernel -= verticalSubsample)
-        {
-            const long packRow = packRowBase + posxInKernel;
-            const long packCol = packColBase + wrow;
-            packedMatrix[packRow + packCol*packedInputRows] = currentInputValue;
-        }
-        packColBase += outputHeight;
-    }
-}
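// Background note (illustrative, not part of this file):
// _assignPackedConvolutionInput is an im2col-style pack: each
// kernelWidth x kernelHeight x inputChannels window becomes one column of
// packedMatrix, so the convolution itself reduces to a dense matrix product.
// The resulting shape, in the kernel's own terms:
//
//   packedInputRows          = kernelWidth * kernelHeight * inputChannels;
//   packedInputColsPerSample = outputWidth * outputHeight;
//   // conv output = filterMatrix(outputChannels x packedInputRows) * packedMatrix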
-
-//assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11)
-template<class ElemType>
-__global__ void _unpackConvolutionInput(const ElemType * packedMatrix, ElemType * inputSubBatch, const long batchSize,
-                                        const long inputWidth, const long inputHeight, const long inputChannels,
-                                        const long outputWidth, const long outputHeight, const long outputChannels,
-                                        const long kernelWidth, const long kernelHeight, const long horizontalSubsample, const long verticalSubsample, const bool zeroPadding)
-{
-    const long inputHeightTimesChannel = inputHeight * inputChannels;
-    const size_t inputDim = inputWidth*inputHeightTimesChannel;
-
-    const long idall = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = idall / inputDim;
-    if (sample >= batchSize)
-        return;
-
-    const long id = idall % inputDim;
-    const long y = id / inputHeightTimesChannel; //inputCol
-
-    const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
-    const size_t packedInputColsPerSample = outputWidth * outputHeight; //output size per channel
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels)
-    // IN_ELEM_COLPOS = sample
-
-    const long nXC = id % inputHeightTimesChannel; //channel + inputRow*inputChannels
-    const long x = nXC / inputChannels; //inputRow
-    const long c = nXC % inputChannels; //channel
-
-    long x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    if (zeroPadding)
-    {
-        const long halfKernelWidth = kernelWidth/2;
-        const long halfKernelHeight = kernelHeight/2;
-
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1.0f+halfKernelHeight)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x+halfKernelHeight-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1.0f+halfKernelWidth)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y+halfKernelWidth-y0*horizontalSubsample; //first posyInKernel
-    }
-    else
-    {
-        x0 = max(0.0f, ceil((x-(ElemType)kernelHeight+1)/ (ElemType)verticalSubsample)); //row : first wrow in which x is in
-        x1 = x-x0*verticalSubsample; //first posxInKernel
-        y0 = max(0.0f, ceil((y-(ElemType)kernelWidth+1)/(ElemType)horizontalSubsample)); //col : first wcol in which y is in
-        y1 = y-y0*horizontalSubsample; //first posyInKernel
-    }
-
-    // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight)
-    // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow)
-
-    ElemType currentInputValue = inputSubBatch[id + sample*inputDim];
-    long packColBase = sample*packedInputColsPerSample + y0*outputHeight;
-    for (long wcol = y0, posyInKernel = y1; wcol < outputWidth && posyInKernel>=0; wcol++, posyInKernel -= horizontalSubsample)
-    {
-        long packRowBase = c * kernelWidth * kernelHeight + posyInKernel * kernelHeight;
-        for (long wrow = x0, posxInKernel = x1; wrow < outputHeight && posxInKernel>=0; wrow++, posxInKernel -= verticalSubsample)
-        {
-            const long packRow = packRowBase + posxInKernel;
-            const long packCol = packColBase + wrow;
-            currentInputValue += packedMatrix[packRow + packCol*packedInputRows];
-        }
-        packColBase += outputHeight;
-    }
-
-    inputSubBatch[id + sample*inputDim] = currentInputValue;
-}
-
-template<class ElemType>
-__global__ void _assignMaxPoolingResult(ElemType * outputBatch, const ElemType * inputBatch, const long batchSize, const long channels,
-                                        const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                        const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                        const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = outputIndex / outputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long outputIndexWithinSample = outputIndex % outputSizePerSample;
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = outputIndexWithinSample / outputHeightTimesChannel; //wcol
-    const long nXC = outputIndexWithinSample % outputHeightTimesChannel; //channel + wrow*channels
-    const long x = nXC / channels; //wrow
-    const long c = nXC % channels; //channel
-
-    const ElemType *inputBatchBase4Sample = inputBatch + sample*inputSizePerSample;
-    register ElemType maxVal = -FLT_MAX;
-    const long rowInWindowBase = (x*verticalSubsample + y*horizontalSubsample*inputHeight)*channels+c;
-    for (long colInWindow=0; colInWindow<windowWidth; colInWindow++)
-    {
-        long rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
-        for (long rowInWindow=0; rowInWindow<windowHeight; rowInWindow++)
-        {
-            const ElemType val = inputBatchBase4Sample[rowInInput];
-            maxVal = max(maxVal, val);
-            rowInInput += channels; //consecutive rows of a channel are `channels` apart
-        }
-    }
-
-    outputBatch[outputIndexWithinSample + sample*outputSizePerSample] = maxVal;
-}
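// Background note (illustrative, not part of this file): _assignMaxPoolingResult
// assigns one thread per output element and walks its pooling window column by
// column, stepping `channels` per row because samples are laid out
// [channel, row, col]. For the unpadded windows used here the output extents
// follow the usual relation:
//
//   outputHeight = (inputHeight - windowHeight) / verticalSubsample   + 1;
//   outputWidth  = (inputWidth  - windowWidth)  / horizontalSubsample + 1;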
-
-template<class ElemType>
-__global__ void _addMaxPoolingGradient(ElemType * inputGradientBatch, const ElemType * outputGradientBatch, const ElemType * inputBatch, const ElemType * outputBatch,
-                                       const long batchSize, const long channels,
-                                       const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                       const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                       const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long inputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = inputIndex / inputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long inputIndexWithinSample = inputIndex % inputSizePerSample;
-
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = inputIndexWithinSample / inputHeightTimesChannel; //col in input
-    const long nXC = inputIndexWithinSample % inputHeightTimesChannel; //channel + row*channels
-    const long x = nXC / channels; //row in input
-    const long c = nXC % channels; //channel
-
-    long startOutX = max(0.0f, ceil((x-(ElemType)windowHeight+1)/ (ElemType)verticalSubsample)); //inclusive start
-    long endOutX = (x/verticalSubsample < outputHeight-1)? x/verticalSubsample : outputHeight-1; //inclusive end
-    long startOutY = max(0.0f, ceil((y-(ElemType)windowWidth+1)/(ElemType)horizontalSubsample)); //inclusive start
-    long endOutY = (y/horizontalSubsample < outputWidth-1)? y/horizontalSubsample : outputWidth-1; //inclusive end
-
-
-    ElemType *inputGradientBatchBase4Sample = inputGradientBatch + sample*inputSizePerSample;
-    const ElemType *outputGradientBatchBase4Sample = outputGradientBatch + sample*outputSizePerSample;
-    const ElemType * outputBatchBase4Sample = outputBatch + sample*outputSizePerSample;
-
-    ElemType inputValue = inputBatch[inputIndexWithinSample + sample*inputSizePerSample];
-    for (long outY=startOutY; outY<=endOutY; outY++)
-    {
-        for (long outX=startOutX; outX<=endOutX; outX++)
-        {
-            long outputIndex = outY * outputHeightTimesChannel + outX * channels + c;
-            if (inputValue == outputBatchBase4Sample[outputIndex])
-                inputGradientBatchBase4Sample[inputIndexWithinSample] += outputGradientBatchBase4Sample[outputIndex];
-        }
-    }
-}
-
-template<class ElemType>
-__global__ void _assignAveragePoolingResult(ElemType * outputBatch, const ElemType * inputBatch, const long batchSize, const long channels,
-                                            const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                            const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                            const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = outputIndex / outputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long outputIndexWithinSample = outputIndex % outputSizePerSample;
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = outputIndexWithinSample / outputHeightTimesChannel; //wcol
-    const long nXC = outputIndexWithinSample % outputHeightTimesChannel; //channel + wrow*channels
-    const long x = nXC / channels; //wrow
-    const long c = nXC % channels; //channel
-
-    const ElemType *inputBatchBase4Sample = inputBatch + sample*inputSizePerSample;
-
-    register ElemType average = 0;
-    const long rowInWindowBase = (x*verticalSubsample + y*horizontalSubsample*inputHeight)*channels+c;
-    for (long colInWindow=0; colInWindow<windowWidth; colInWindow++)
-    {
-        long rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
-        for (long rowInWindow=0; rowInWindow<windowHeight; rowInWindow++)
-        {
-            average += inputBatchBase4Sample[rowInInput];
-            rowInInput += channels;
-        }
-    }
-
-    outputBatch[outputIndexWithinSample + sample*outputSizePerSample] = average/(windowWidth * windowHeight);
-}
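// Background note (illustrative, not part of this file): _addMaxPoolingGradient
// routes each output gradient back to the input position(s) that produced the
// max by re-comparing the saved input value with the pooled output:
//
//   if (inputValue == outputBatchBase4Sample[outputIndex])
//       inputGradient += outputGradient;   // ties are credited more than once
//
// Re-deriving the argmax this way avoids storing an index map, at the cost of a
// second pass over every window that covers the input element.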
-
-template<class ElemType>
-__global__ void _addAveragePoolingGradient(ElemType * inputGradientBatch, const ElemType * outputGradientBatch,
-                                           const long batchSize, const long channels,
-                                           const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                           const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                           const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long inputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = inputIndex / inputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long inputIndexWithinSample = inputIndex % inputSizePerSample;
-
-    const long inputHeightTimesChannel = inputHeight * channels;
-    const long outputHeightTimesChannel = outputHeight * channels;
-    const long windowSize = windowWidth * windowHeight;
-
-    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
-    // IN_ELEM_COLPOS = sample
-
-    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
-    // OUT_ELEM_COLPOS = sample
-
-    const long y = inputIndexWithinSample / inputHeightTimesChannel; //col in input
-    const long nXC = inputIndexWithinSample % inputHeightTimesChannel; //channel + row*channels
-    const long x = nXC / channels; //row in input
-    const long c = nXC % channels; //channel
-
-    long startOutX = max(0.0f, ceil((x-(ElemType)windowHeight+1)/ (ElemType)verticalSubsample)); //inclusive start
-    long endOutX = (x/verticalSubsample < outputHeight-1)? x/verticalSubsample : outputHeight-1; //inclusive end
-    long startOutY = max(0.0f, ceil((y-(ElemType)windowWidth+1)/(ElemType)horizontalSubsample)); //inclusive start
-    long endOutY = (y/horizontalSubsample < outputWidth-1)? y/horizontalSubsample : outputWidth-1; //inclusive end
-
-    ElemType *inputGradientBatchBase4Sample = inputGradientBatch + sample*inputSizePerSample;
-    const ElemType *outputGradientBatchBase4Sample = outputGradientBatch + sample*outputSizePerSample;
-
-    for (long outY=startOutY; outY<=endOutY; outY++)
-    {
-        for (long outX=startOutX; outX<=endOutX; outX++)
-        {
-            long outputIndex = outY * outputHeightTimesChannel + outX * channels + c;
-            inputGradientBatchBase4Sample[inputIndexWithinSample] += outputGradientBatchBase4Sample[outputIndex]/windowSize;
-        }
-    }
-}
-
-template<class ElemType>
-__global__ void _addMaxPoolingGradientLoopOut(ElemType * inputGradientBatch, const ElemType * outputGradientBatch, const ElemType * inputBatch, const ElemType * outputBatch,
-                                              const long batchSize, const long channels,
-                                              const long inputWidth, const long inputHeight, const long inputSizePerSample,
-                                              const long outputWidth, const long outputHeight, const long outputSizePerSample,
-                                              const long windowWidth, const long windowHeight, const long horizontalSubsample, const long verticalSubsample)
-{
-    const long outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    const long sample = outputIndex / outputSizePerSample;
-    if (sample >= batchSize)
-        return;
-
-    const long outputIndexWithinSample = outputIndex % outputSizePerSample;
-    const long inputWidthTimesChannel = inputWidth * channels;
-    const long outputWidthTimesChannel = outputWidth * channels;
-    const long y = outputIndexWithinSample / outputWidthTimesChannel;
-    const long nXC = outputIndexWithinSample % outputWidthTimesChannel;
-    const long x = nXC / channels;
-    const long c = nXC % channels;
-
-    const long offset0 = sample*inputSizePerSample + y*verticalSubsample*inputWidthTimesChannel + x*horizontalSubsample*channels;
-    const ElemType *pCurWindow4Input = inputBatch + offset0; // pooling to current window's first input pixel
-    ElemType *pCurWindow4InGradient = inputGradientBatch + offset0;
-    const ElemType maxVal = outputBatch[outputIndex];
-    const ElemType outGrad = outputGradientBatch[outputIndex];
-    for (long yy=0; yy<windowHeight; yy++)
-    {
-        const long rowOffset = yy*inputWidthTimesChannel;
-        for (long xx=0; xx<windowWidth; xx++)
-        {
-            const long posInWindow = rowOffset + xx*channels;
-            if (pCurWindow4Input[posInWindow] == maxVal) //this input produced the max
-                pCurWindow4InGradient[posInWindow] += outGrad;
-        }
-    }
-}
-
-template<class ElemType>
-__global__ void _addElementProductOf(
-    ElemType* us,
-    const ElemType* a,
-    const ElemType* b,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    us[id]+=(a[id]*b[id]);
-}
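// Background note (illustrative, not part of this file):
// _addAveragePoolingGradient is the linear counterpart: every input inside a
// window contributed 1/windowSize to that window's average, so each input just
// accumulates outputGradient/windowSize from every window covering it -- no
// value comparison is needed.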
-
-template<class ElemType>
-__global__ void _columnElementMultiplyWith(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //a.GetNumRows();
-    const long M) //us.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType mul=a[id];
-    for (long j=0;j<M;++j)
-    {
-        us[IDX2C(id,j,N)]=us[IDX2C(id,j,N)]*mul;
-    }
-}
-
-template<class ElemType>
-__global__ void _rowElementMultiplyWith(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //us.GetNumRows();
-    const long M) //a.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=M)
-        return;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType mul=a[id];
-    for (long i=0;i<N;++i)
-    {
-        us[IDX2C(i,id,N)]=us[IDX2C(i,id,N)]*mul;
-    }
-}
-
-template<class ElemType>
-__global__ void _rowElementDivideBy(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //us.GetNumRows();
-    const long M) //a.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= M)
-        return;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType v = a[id];
-    if (v >= 0 && v < EPS_IN_INVERSE)
-        v = EPS_IN_INVERSE;
-    else if (v < 0 && v > -EPS_IN_INVERSE)
-        v = (-EPS_IN_INVERSE);
-
-    for (long i = 0; i < N; ++i)
-    {
-        us[IDX2C(i, id, N)] = us[IDX2C(i, id, N)] / v;
-    }
-}
-
-template<class ElemType>
-__global__ void _ColumnElementDivideBy(
-    ElemType* us,
-    const ElemType* a,
-    const long N, //a.GetNumRows();
-    const long M) //us.GetNumCols();
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-
-    ElemType smallValue = EPS_IN_INVERSE;
-
-    //__shared__ ElemType _a[threadsPerBlock];
-    //_a[threadIdx.x]=a[id];
-    ElemType v=a[id];
-    for (long j=0;j<M;++j)
-    {
-        if (v <0 && v > -smallValue)
-            us[IDX2C(id,j,N)] /= (-smallValue);
-        else if (v >=0 && v < smallValue)
-            us[IDX2C(id,j,N)] /= smallValue;
-        else
-            us[IDX2C(id,j,N)] /= v;
-    }
-
-}
-
-
-template<class ElemType>
-__global__ void _innerProduct(
-    ElemType* c,
-    const ElemType* a,
-    const ElemType* b,
-    const long N, //a.GetNumRows();
-    const long M, //a.GetNumCols();
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if ((isColWise && id>=M) || (!isColWise && id>=N))
-        return;
-
-    ElemType sum = 0;
-    long index;
-    if (isColWise)
-    {
-        for (long i=0; i<N; ++i)
-        {
-            index = IDX2C(i,id,N);
-            sum += a[index]*b[index];
-        }
-    }
-    else
-    {
-        for (long j=0; j<M; ++j)
-        {
-            index = IDX2C(id,j,N);
-            sum += a[index]*b[index];
-        }
-    }
-    c[id] = sum;
-}
-
-template<class ElemType>
-__global__ void _assignSignOf(
-    ElemType* a,
-    const ElemType* b,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    ElemType v = b[id];
-    a[id] = (v == (ElemType)0? (ElemType)0 : (v > 0? (ElemType)1 : (ElemType)(-1)));
-}
-
-template<class ElemType>
-__global__ void _addSignOf(
-    ElemType* a,
-    const ElemType* b,
-    const LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    ElemType v = b[id];
-    a[id] += (v == (ElemType)0? (ElemType)0 : (v > 0? (ElemType)1 : (ElemType)(-1)));
-}
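// Background sketch (illustrative, not part of this file): the
// *ElementMultiplyWith / *ElementDivideBy kernels broadcast a vector across a
// matrix: one thread per row (or column) caches its vector element in a
// register and sweeps the orthogonal dimension, e.g. for a column vector a of
// size N x 1 against an N x M matrix us:
//
//   ElemType mul = a[id];            // a single global read per thread
//   for (long j = 0; j < M; ++j)
//       us[IDX2C(id, j, N)] *= mul;  // touch all of row id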
-
-template<class ElemType>
-__global__ void _vectorMaxMinReduce( //this function processes 1 column per block. this function needs 512 threads
-    const ElemType* us,
-    ElemType* Indexes,
-    ElemType* Values,
-    const long m, //number of rows
-    const long n, //number of cols
-    bool isMax)
-{
-    //we first find max per column
-    __shared__ ElemType partials[512];
-    __shared__ int partialsInd[512];
-    if (isMax)
-    {
-        partials[threadIdx.x]=-10000000;
-    }
-    else
-    {
-        partials[threadIdx.x]=10000000;
-    }
-    partialsInd[threadIdx.x]=-1;
-
-    //int id = blockDim.x * blockIdx.x + threadIdx.x;
-    int loadPerThread = m/blockDim.x;
-
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? m : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        if (( isMax ? us[IDX2C(i,blockIdx.x,m)]>partials[threadIdx.x] : us[IDX2C(i,blockIdx.x,m)]<partials[threadIdx.x]) || partialsInd[threadIdx.x]==-1)
-        {
-            partials[threadIdx.x]=us[IDX2C(i,blockIdx.x,m)];
-            partialsInd[threadIdx.x]=i;
-        }
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        if ((isMax ? partials[threadIdx.x+256]>partials[threadIdx.x] : partials[threadIdx.x+256]<partials[threadIdx.x]) && partialsInd[threadIdx.x+256]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+256];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+256];
-        }
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        if ((isMax ? partials[threadIdx.x+128]>partials[threadIdx.x] : partials[threadIdx.x+128]<partials[threadIdx.x]) && partialsInd[threadIdx.x+128]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+128];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+128];
-        }
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        if ((isMax ? partials[threadIdx.x+64]>partials[threadIdx.x] : partials[threadIdx.x+64]<partials[threadIdx.x]) && partialsInd[threadIdx.x+64]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+64];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+64];
-        }
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        if ((isMax ? partials[threadIdx.x+32]>partials[threadIdx.x] : partials[threadIdx.x+32]<partials[threadIdx.x]) && partialsInd[threadIdx.x+32]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+32];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+32];
-        }
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        if ((isMax ? partials[threadIdx.x+16]>partials[threadIdx.x] : partials[threadIdx.x+16]<partials[threadIdx.x]) && partialsInd[threadIdx.x+16]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+16];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+16];
-        }
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        if ((isMax ? partials[threadIdx.x+8]>partials[threadIdx.x] : partials[threadIdx.x+8]<partials[threadIdx.x]) && partialsInd[threadIdx.x+8]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+8];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+8];
-        }
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        if ((isMax ? partials[threadIdx.x+4]>partials[threadIdx.x] : partials[threadIdx.x+4]<partials[threadIdx.x]) && partialsInd[threadIdx.x+4]!=-1)
-        {
-            partials[threadIdx.x]=partials[threadIdx.x+4];
-            partialsInd[threadIdx.x]=partialsInd[threadIdx.x+4];
-        }
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        ElemType mx = partials[0];
-        int ind = partialsInd[0];
-        if ((isMax ? mx<partials[1] : mx>partials[1]) || ind ==-1)
-        {
-            mx = partials[1];
-            ind = partialsInd[1];
-        }
-        if ((isMax ? mx<partials[2] : mx>partials[2]) || ind ==-1)
-        {
-            mx = partials[2];
-            ind = partialsInd[2];
-        }
-        if ((isMax ? mx<partials[3] : mx>partials[3]) || ind ==-1)
-        {
-            mx = partials[3];
-            ind = partialsInd[3];
-        }
-        Values[blockIdx.x] = mx;
-        Indexes[blockIdx.x] = ind;
-    }
-}
-
-template<class ElemType>
-__global__ void _vectorMax(
-    const ElemType* us,
-    ElemType* maxIndexes,
-    ElemType* maxValues,
-    const long m, //number of rows
-    const long n, //number of cols
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    long maxInd = -1;
-    ElemType maxVal = -100000;
-
-    if (isColWise)
-    {
-        if (id>=n)
-            return;
-
-        for (long i=0;i<m;i++)
-        {
-            if (us[IDX2C(i,id,m)]>=maxVal)
-            {
-                maxInd = i;
-                maxVal = us[IDX2C(i,id,m)];
-            }
-        }
-    }
-    else
-    {
-        if (id>=m)
-            return;
-
-        for (long j=0;j<n;j++)
-        {
-            if (us[IDX2C(id,j,m)]>=maxVal)
-            {
-                maxInd = j;
-                maxVal = us[IDX2C(id,j,m)];
-            }
-        }
-    }
-    maxIndexes[id]=maxInd;
-    maxValues[id]=maxVal;
-}
-
-template<class ElemType>
-__global__ void _vectorMin(
-    const ElemType* us,
-    ElemType* minIndexes,
-    ElemType* minValues,
-    const long m, //number of rows
-    const long n, //number of cols
-    const bool isColWise)
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    long minInd = -1;
-    ElemType minVal = 100000;
-
-    if (isColWise)
-    {
-        if (id>=n)
-            return;
-
-        for (long i=0;i<m;i++)
-        {
-            if (us[IDX2C(i,id,m)]<=minVal)
-            {
-                minInd = i;
-                minVal = us[IDX2C(i,id,m)];
-            }
-        }
-    }
-    else
-    {
-        if (id>=m)
-            return;
-
-        for (long j=0;j<n;j++)
-        {
-            if (us[IDX2C(id,j,m)]<=minVal)
-            {
-                minInd = j;
-                minVal = us[IDX2C(id,j,m)];
-            }
-        }
-    }
-    minIndexes[id]=minInd;
-    minValues[id]=minVal;
-}
-
-template<class ElemType>
-__global__ void _matrixVectorColumnWiseAddWithThreadPerElem(
-    const ElemType* a,
-    ElemType* us,
-    ElemType alpha,
-    const long m, //number of rows
-    const long n) //number of cols
-{
-    long id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= m*n)
-        return;
-
-    long col = id / m;
-    long row = id - col*m;
-
-    us[id] += alpha*a[row];
-}
-
-template<class ElemType>
-__global__ void _matrixVectorColumnWiseAddWithThreadPerRow(
-    const ElemType* a,
-    ElemType* us,
-    ElemType alpha,
-    const long m, //number of rows
-    const long n) //number of cols
-{
-#ifdef VALIDATION
-    if (blockDim.x * blockIdx.x + threadIdx.x == 0)
-    {
-        printf("** _matrixVectorColumnWiseAdd on device:\na = %p, us = %p, alpha = %f, m = %ld, n = %ld\n",
-               a,us,alpha,m,n);
-        printf("us[0] = %f\n", us[0]);
-        printf("a[0] = %f\n", a[0]);
-    }
-#endif
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=m)
-        return;
-    ElemType tmp = a[id];
-#ifdef VALIDATION
-    printf("  a[%d] = %f\n", id, tmp);
-#endif
-    for (long j = 0; j < n; ++j )
-    {
-        us[j*m+id] += alpha*tmp;
-    }
-
-}
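// Background note (illustrative, not part of this file): the three
// _matrixVectorColumnWiseAdd* variants trade parallelism against redundant
// loads: thread-per-element maximizes occupancy but re-reads a[row] once per
// column; thread-per-row reads a[row] once into a register and loops over the
// columns; block-per-row shares a single vector element across a whole block.
// Which one wins depends on the matrix shape, which is presumably why all
// three were kept.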
-
-
-template<class ElemType>
-__global__ void _matrixVectorColumnWiseAddBlockPerRow(
-    const ElemType* a,
-    ElemType* us,
-    ElemType alpha,
-    const long m, //number of rows
-    const long n) //number of cols
-{
-    __shared__ ElemType tmp; //shared, so the value read by thread 0 is visible to the whole block
-
-    if (threadIdx.x==0)
-    {
-        tmp = a[blockIdx.x];
-    }
-    __syncthreads();
-
-    int loadPerThread = n/blockDim.x;
-
-    for (int i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? n : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        us[m*blockIdx.x + i] += alpha*tmp;
-    }
-}
-
-
-
-template<class ElemType>
-__global__ void _addScaledDifference(
-    ElemType alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = c[id] + (a[id]-b[id]) * (alpha);
-}
-
-template<class ElemType>
-__global__ void _assignScaledDifference(
-    ElemType alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = (a[id]-b[id]) * (alpha);
-}
-
-template<class ElemType>
-__global__ void _addScaledDifference(
-    ElemType *alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = c[id] + (a[id]-b[id]) * alpha[0];
-}
-
-template<class ElemType>
-__global__ void _assignScaledDifference(
-    ElemType *alpha,
-    ElemType *a,
-    ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    c[id] = (a[id]-b[id]) * alpha[0];
-}
-
-template<class ElemType>
-__global__ void _addElementToElement(
-    const ElemType *a, LONG64 indexA,
-    ElemType *c, LONG64 indexC)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>0)
-        return;
-    c[indexC] += a[indexA];
-}
-
-template<class ElemType>
-__global__ void _assignNumOfDiff(
-    const ElemType *a,
-    const ElemType *b,
-    ElemType *c,
-    LONG64 N)
-{
-    __shared__ ElemType partialSums[1024];
-    partialSums[threadIdx.x]=0;
-    //int id = blockDim.x * blockIdx.x + threadIdx.x;
-    LONG64 loadPerThread = N/blockDim.x;
-    for (LONG64 i= threadIdx.x*loadPerThread; i< (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x+1)*loadPerThread);++i)
-    {
-        partialSums[threadIdx.x]+=(a[i] != b[i]);
-    }
-    __syncthreads();
-
-    //512
-    if (threadIdx.x<512)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+512];
-    }
-    __syncthreads();
-
-    //256
-    if (threadIdx.x<256)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+256];
-    }
-    __syncthreads();
-
-    //128
-    if (threadIdx.x<128)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+128];
-    }
-    __syncthreads();
-
-    //64
-    if (threadIdx.x<64)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+64];
-    }
-    __syncthreads();
-
-    //32
-    if (threadIdx.x<32)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+32];
-    }
-    __syncthreads();
-
-    //16
-    if (threadIdx.x<16)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+16];
-    }
-    __syncthreads();
-
-    //8
-    if (threadIdx.x<8)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+8];
-    }
-    __syncthreads();
-
-    //4
-    if (threadIdx.x<4)
-    {
-        partialSums[threadIdx.x]+=partialSums[threadIdx.x+4];
-    }
-    __syncthreads();
-
-    if (threadIdx.x==0)
-    {
-        c[0] = partialSums[0]+partialSums[1]+partialSums[2]+partialSums[3];
-    }
-}
-
-
-/*template<class ElemType>
-__global__ void _assignNumOfDiff(
-ElemType *a,
-ElemType *b,
-ElemType *c,
-long N)
-{
-//TO DO: replace atomic operation with reduction
-
-__shared__ int totalSum;
-if (threadIdx.x == 0) totalSum = 0;
-__syncthreads();
-
-int id = blockDim.x * blockIdx.x + threadIdx.x;
-if (id>=N)
-return;
-
-int localVal = (a[id] != b[id]);
-atomicAdd(&totalSum, localVal);
-__syncthreads();
-
-c[id] = totalSum;
-}*/
-
-template<class ElemType>
-__global__ void _scaleArray(
-    ElemType alpha,
-    ElemType *us,
-    LONG64 N)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    us[id]=us[id]*alpha;
-}
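// Background note (illustrative, not part of this file): _assignNumOfDiff
// counts mismatches with per-thread partial sums followed by a shared-memory
// tree reduction; the commented-out variant shows the simpler alternative, a
// single atomicAdd counter. The reduction form needs only O(log blockDim.x)
// synchronized steps instead of serializing every update on one counter, which
// generally scales better.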
-
-
-template<class ElemType>
-__global__ void _sparseCSRPlusDense(
-    ElemType alpha,
-    const ElemType* m_dVal,
-    const int* m_dRow,
-    const int* m_dCol,
-    ElemType* pArrayDev,
-    LONG64 M)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=M)
-        return;
-    int start = m_dRow[id];
-    int end = m_dRow[id+1];
-    for (int _i=start;_i<end;++_i) //_i indexes row id's slice of the CSR val/col arrays
-    {
-        pArrayDev[IDX2C(id,m_dCol[_i],M)] += (alpha*m_dVal[_i]);
-    }
-}
-
-template<class ElemType>
-__global__ void _sparseCSRElemMulDense(
-    const ElemType* m_dVal,
-    const int* m_dRow,
-    const int* m_dCol,
-    const ElemType* b,
-    ElemType* c,
-    LONG64 M)
-{
-    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=M)
-        return;
-    int start = m_dRow[id];
-    int end = m_dRow[id+1];
-    for (int _i=start;_i<end;++_i) //_i indexes row id's slice of the CSR val/col arrays
-    {
-        c[IDX2C(id,m_dCol[_i],M)] = b[IDX2C(id,m_dCol[_i],M)] * m_dVal[_i];
-    }
-}
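// Background sketch (illustrative, not part of this file): both _sparseCSR*
// kernels use the standard compressed-sparse-row layout, where
// m_dRow[r] .. m_dRow[r+1] delimit row r's slice of the parallel arrays
// (m_dVal, m_dCol). With one thread per row, the traversal is simply:
//
//   int start = m_dRow[id], end = m_dRow[id + 1];
//   for (int k = start; k < end; ++k)      // k walks row id's nonzeros
//       process(m_dVal[k], m_dCol[k]);     // value and its column index (process() is a placeholder)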