diff --git a/.gitignore b/.gitignore
index 00a6a56c7..c08ca3570 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,162 +1,168 @@
-## Ignore Visual Studio temporary files, build results, and
-## files generated by popular Visual Studio add-ons.
-
-# User-specific files
-*.suo
-*.user
-*.sln.docstates
-*.orig
-
-# Build results
-
-[Dd]ebug/
-[Rr]elease/
-x64/
-build/
-[Bb]in/
-[Oo]bj/
-
-# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
-!packages/*/build/
-
-# MSTest test Results
-[Tt]est[Rr]esult*/
-[Bb]uild[Ll]og.*
-
-*_i.c
-*_p.c
-*.ilk
-*.meta
-*.obj
-*.pch
-*.pdb
-*.pgc
-*.pgd
-*.rsp
-*.sbr
-*.tlb
-*.tli
-*.tlh
-*.tmp
-*.tmp_proj
-*.log
-*.vspscc
-*.vssscc
-.builds
-*.pidb
-*.log
-*.scc
-*.dep
-
-# Visual C++ cache files
-ipch/
-*.aps
-*.ncb
-*.opensdf
-*.sdf
-*.cachefile
-
-# Visual Studio profiler
-*.psess
-*.vsp
-*.vspx
-
-# Guidance Automation Toolkit
-*.gpState
-
-# ReSharper is a .NET coding add-in
-_ReSharper*/
-*.[Rr]e[Ss]harper
-
-# TeamCity is a build add-in
-_TeamCity*
-
-# DotCover is a Code Coverage Tool
-*.dotCover
-
-# NCrunch
-*.ncrunch*
-.*crunch*.local.xml
-
-# Installshield output folder
-[Ee]xpress/
-
-# DocProject is a documentation generator add-in
-DocProject/buildhelp/
-DocProject/Help/*.HxT
-DocProject/Help/*.HxC
-DocProject/Help/*.hhc
-DocProject/Help/*.hhk
-DocProject/Help/*.hhp
-DocProject/Help/Html2
-DocProject/Help/html
-
-# Click-Once directory
-publish/
-
-# Publish Web Output
-*.Publish.xml
-
-# NuGet Packages Directory
-## TODO: If you have NuGet Package Restore enabled, uncomment the next line
-#packages/
-
-# Windows Azure Build Output
-csx
-*.build.csdef
-
-# Windows Store app package directory
-AppPackages/
-
-# Others
-sql/
-*.Cache
-ClientBin/
-[Ss]tyle[Cc]op.*
-~$*
-*~
-*.dbmdl
-*.[Pp]ublish.xml
-*.pfx
-*.publishsettings
-
-# RIA/Silverlight projects
-Generated_Code/
-
-# Backup & report files from converting an old project file to a newer
-# Visual Studio version. Backup files are not needed, because we have git ;-)
-_UpgradeReport_Files/
-Backup*/
-UpgradeLog*.XML
-UpgradeLog*.htm
-
-# SQL Server files
-App_Data/*.mdf
-App_Data/*.ldf
-
-
-#LightSwitch generated files
-GeneratedArtifacts/
-_Pvt_Extensions/
-ModelManifest.xml
-
-# =========================
-# Windows detritus
-# =========================
-
-# Windows image file caches
-Thumbs.db
-ehthumbs.db
-
-# Folder config file
-Desktop.ini
-
-# Recycle Bin used on file shares
-$RECYCLE.BIN/
-
-# Mac desktop service store files
-.DS_Store
-
-*.lyx~
-*.bak
-*.lyx#
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+*.orig
+
+# Build results
+
+[Dd]ebug/
+[Rr]elease/
+x64/
+build/
+[Bb]in/
+[Oo]bj/
+
+# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
+!packages/*/build/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.log
+*.scc
+*.dep
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+*.cachefile
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+*.ncrunch*
+.*crunch*.local.xml
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.Publish.xml
+
+# NuGet Packages Directory
+## TODO: If you have NuGet Package Restore enabled, uncomment the next line
+#packages/
+
+# Windows Azure Build Output
+csx
+*.build.csdef
+
+# Windows Store app package directory
+AppPackages/
+
+# Others
+sql/
+*.Cache
+ClientBin/
+[Ss]tyle[Cc]op.*
+~$*
+*~
+*.dbmdl
+*.[Pp]ublish.xml
+*.pfx
+*.publishsettings
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+App_Data/*.mdf
+App_Data/*.ldf
+
+
+#LightSwitch generated files
+GeneratedArtifacts/
+_Pvt_Extensions/
+ModelManifest.xml
+
+# =========================
+# Windows detritus
+# =========================
+
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Mac desktop service store files
+.DS_Store
+
+*.lyx~
+*.bak
+*.lyx#
+
+# =========================
+# prebuild file
+# =========================
+MachineLearning/cn/buildinfo.h
+
diff --git a/Common/ConfigFile.cpp b/Common/ConfigFile.cpp
index f3902df4b..f5eb505a3 100644
--- a/Common/ConfigFile.cpp
+++ b/Common/ConfigFile.cpp
@@ -1,279 +1,280 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// ConfigFile.cpp : Defines the configuration file loader.
-//
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "File.h"
-#include "commandArgUtil.h"
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-
- // ParseCommandLine - parse the command line parameters
- // argc - count of arguments
- // argv - array of argument parameters
- // config - config to return
- std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
- {
- config.SetName(std::string("global"));
-
- // This vector keeps track of the config files we have already read
- std::vector resolvedConfigFiles;
- std::string configString;
-
- // start at 1, because 0 is the name of the EXE
- for (int i=1; i < argc; ++i)
- {
- wstring str = argv[i];
-
- // see if they are loading a config file
- wstring configDescriptor = L"configFile=";
- int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
-
- // no config file, parse as regular argument
- if (compare)
- {
- configString += (msra::strfun::utf8(str) + "\n");
- }
- else // One or more config file paths specified in a "+"-separated list.
- {
- const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
- std::vector filePathsVec = msra::strfun::split(filePaths, "+");
- for (auto filePath : filePathsVec)
- {
- if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
- {
- // if haven't already read this file, read it
- resolvedConfigFiles.push_back(filePath);
- configString += config.ReadConfigFile(filePath);
- }
- else
- RuntimeError("Cannot specify same config file multiple times at the command line.");
- }
- }
- }
-
- configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
- config.FileParse(configString);
- return configString;
- }
-
- // ResolveIncludeStatements - this function takes a config string, and looks for all lines of the
- // form "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
- // If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
- // and includes the body of each file in the string which is eventually returned by this function. If the included
- // config file includes other config files, this function will recursively include those files as well.
- // configString - the config string within which to look for "include" statements
- // resolvedConfigFiles - the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
- // and to prevent files from being included multiple times.
- // returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
- std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector &resolvedConfigFiles)
- {
- std::vector lines = msra::strfun::split(configString, "\n");
- std::string includeKeyword = "include=";
- std::size_t includeKeywordSize = includeKeyword.size();
- std::string newConfigString;
- for (std::string line : lines)
- {
- if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
- {
- std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
- if (filePaths.find(openBraceVar) != std::string::npos)
- {
- RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
- }
-
- std::vector filePathVec = msra::strfun::split (filePaths, "+");
- for (auto filePath : filePathVec)
- {
- // if file hasn't already been resolved (the resolvedPaths vector doesn't contain it), resolve it.
- if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
- {
- // Recursively resolve the include statements in the included config files.
- // Ensure that the same config file isn't included twice, by keeping track of the config
- // files that have already been resolved in the resolvedPaths vector.
- resolvedConfigFiles.push_back(filePath);
- newConfigString += ResolveIncludeStatements(
- ReadConfigFile(filePath),
- resolvedConfigFiles
- );
- }
- else
- {
- // We already resolved this path. Write a warning so that user is aware of this.
- // TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
- fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s", filePath.c_str());
- }
- }
- }
- else
- {
- newConfigString += (line + "\n");
- }
- }
- return newConfigString;
- }
-
- // LoadConfigFiles - load multiple configuration file, and adds to config parameters
- // filePaths - A "+" delimited list of file paths, corresponding to config files to load
- // configStringToAppend - A config string which should be processed together with the config files
- void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
- {
- std::string configString = ReadConfigFiles(filePaths);
- if(configStringToAppend != nullptr)
- {
- configString += *configStringToAppend;
- }
-
- FileParse(configString);
- }
-
- // LoadConfigFileAndResolveVariables - load a configuration file, and add to config parameters.
- // If the config file contains references to variables, which are defined in the 'config' ConfigParameters,
- // then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
- // in order to allow them to access variables defined in the primary config file via $varName$ syntax.
- // filePath - filePath to the file to load
- // config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
- void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
- {
- // read file, resolve variables, and then parse.
- std::string fileContents = ReadConfigFile(filePath);
- fileContents = config.ResolveVariables(fileContents);
- FileParse(fileContents);
- }
-
- // LoadConfigFile - load a configuration file, and add to config parameters
- // filePath - filePath to the file to read
- void ConfigParser::LoadConfigFile(const std::wstring &filePath)
- {
- // read and then parse
- FileParse(ReadConfigFile(filePath));
- }
-
- // Same as "ReadConfigFiles" function below, but takes as input string instead of wstring
- std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
- {
- return ReadConfigFiles(msra::strfun::utf16(filePaths));
- }
-
- // ReadConfigFiles - reads multiple config files, concatenates the content from each file, and returns a string
- // filePaths - A "+" delimited list of file paths, corresponding to config files to read
- // returns: a string with the concatentated file contents
- std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
- {
- std::string configString;
- std::vector filePathVec = msra::strfun::split (filePaths, L"+");
- for (auto filePath : filePathVec)
- {
- configString += ReadConfigFile(filePath);
- }
- return configString;
- }
-
- // Same as "ReadConfigFile" function below, but takes as input string instead of wstring
- std::string ConfigParser::ReadConfigFile(const std::string &filePath)
- {
- return ReadConfigFile(msra::strfun::utf16(filePath));
- }
-
- // ReadConfigFile - read a configuration file, and return as a string
- // filePath - the path to the config file to read
- // returns: a string with the concatentated file contents
- std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
- {
- File file(filePath, fileOptionsRead);
-
- // initialize with file name
- std::string path = msra::strfun::utf8(filePath);
- auto location = path.find_last_of("/\\");
- if (location != npos)
- path = path.substr(location+1);
- m_configName = move(path);
-
- // read the entire file into a string
- // CONSIDER: should the File API support this, instead of line by line?
- size_t fileLength = file.Size();
- string str;
- string configFile;
- configFile.reserve(fileLength);
- while (!file.IsEOF())
- {
- file.GetLine(str);
- str = PreprocessConfigLine(str);
- if (str != "")
- {
- configFile.append(str);
- configFile.append("\n");
- }
- }
- return configFile;
- }
-
- // GetFileConfigNames - determine the names of the features and labels sections in the config file
- // features - [in,out] a vector of feature name strings
- // labels - [in,out] a vector of label name strings
- void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels)
- {
- for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
- {
- auto pair = *iter;
- ConfigParameters temp (iter->second);
- // see if we have a config parameters that contains a "dim" element, it's a sub key, use it
- if (temp.ExistsCurrent("dim"))
- {
- if (temp.ExistsCurrent("labelMappingFile")
- || temp.ExistsCurrent("labelDim")
- || temp.ExistsCurrent("labelType")
- || (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
- {
- labels.push_back(msra::strfun::utf16(iter->first));
- }
- else
- {
- features.push_back(msra::strfun::utf16(iter->first));
- }
- }
- }
- }
-
- // FindConfigNames - determine the names of the heirarchy of sections in the config file that contain a particular key
- // config - configuration to search
- // key - string we ar searching for in each config section
- // names - [in,out] a vector of section names in "path" format (i.e. base\subsection)
- void FindConfigNames(const ConfigParameters& config, std::string key, std::vector& names)
- {
- for (auto iter = config.begin(); iter != config.end(); ++iter)
- {
- auto pair = *iter;
- ConfigParameters temp (iter->second);
- // see if we have a config parameters that contains a "key" element, if so use it
- if (temp.ExistsCurrent(key))
- {
- names.push_back(msra::strfun::utf16(iter->first));
- }
- }
- }
-
- // Trim - trim white space off the start and end of the string
- // str - string to trim
- // NOTE: if the entire string is empty, then the string will be set to an empty string
- void Trim(std::string& str)
- {
- auto found = str.find_first_not_of(" \t");
- if (found == npos)
- {
- str.erase(0);
- return;
- }
- str.erase(0, found);
- found = str.find_last_not_of(" \t");
- if (found != npos)
- str.erase(found+1);
- }
-
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// ConfigFile.cpp : Defines the configuration file loader.
+//
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
+#include "File.h"
+#include "commandArgUtil.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+
+ // ParseCommandLine - build the config from the command line: "configFile=" arguments are read as config files, all other arguments are taken as lines of config text
+ // argc - count of arguments (argv[0], the executable name, is skipped)
+ // argv - array of argument strings
+ // config - [out] config that gets named "global" and is filled by parsing the combined config string; returns: that combined string after "include" resolution
+ std::string ConfigParameters::ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config)
+ {
+ config.SetName(std::string("global"));
+
+ // Paths of the config files already read; used to reject duplicates here and handed on to ResolveIncludeStatements below
+ std::vector resolvedConfigFiles;
+ std::string configString;
+
+ // start at 1, because argv[0] is the name of the executable
+ for (int i=1; i < argc; ++i)
+ {
+ wstring str = argv[i];
+
+ // check whether the argument starts with "configFile=" (_wcsnicmp compares case-insensitively)
+ wstring configDescriptor = L"configFile=";
+ int compare = _wcsnicmp(configDescriptor.c_str(), str.c_str(), configDescriptor.length());
+
+ // non-zero means the prefix did not match: not a config file, append the argument as one line of config text
+ if (compare)
+ {
+ configString += (msra::strfun::utf8(str) + "\n");
+ }
+ else // One or more config file paths specified in a "+"-separated list.
+ {
+ const std::string filePaths = msra::strfun::utf8(str.substr(configDescriptor.length()));
+ std::vector filePathsVec = msra::strfun::split(filePaths, "+");
+ for (auto filePath : filePathsVec)
+ {
+ if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
+ {
+ // first time this path is seen: remember it and append the file's contents
+ resolvedConfigFiles.push_back(filePath);
+ configString += config.ReadConfigFile(filePath);
+ }
+ else
+ RuntimeError("Cannot specify same config file multiple times at the command line.");
+ }
+ }
+ }
+
+ configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
+ config.FileParse(configString);
+ return configString;
+ }
+
+ // ResolveIncludeStatements - this function takes a config string, and looks for all lines that start with
+ // "include=configPaths", where 'configPaths' is a "+" separated list of paths to config files.
+ // If it encounters one of these lines, it reads the config files listed in 'configPaths' (in the specified order),
+ // and includes the body of each file in the string which is eventually returned by this function. If the included
+ // config file includes other config files, this function will recursively include those files as well.
+ // configString - the config string within which to look for "include" statements
+ // resolvedConfigFiles - [in,out] the paths to all the config files that have already been resolved. This vector is used to prevent include loops,
+ // and to prevent files from being included multiple times. Newly included paths are appended to it.
+ // returns: The config string, with all the "include" statements replaced with the bodies of the specified config files.
+ std::string ConfigParser::ResolveIncludeStatements(const std::string &configString, std::vector &resolvedConfigFiles)
+ {
+ std::vector lines = msra::strfun::split(configString, "\n");
+ std::string includeKeyword = "include=";
+ std::size_t includeKeywordSize = includeKeyword.size();
+ std::string newConfigString;
+ for (std::string line : lines)
+ {
+ if (line.compare(0, includeKeywordSize, includeKeyword) == 0)
+ {
+ std::string filePaths = line.substr(includeKeywordSize, line.size() - includeKeywordSize);
+ if (filePaths.find(openBraceVar) != std::string::npos)
+ {
+ RuntimeError("Variable usage (eg, \"$varName$\") not supported in \"include\" statements. Explicit path to config file must be provided");
+ }
+
+ std::vector filePathVec = msra::strfun::split (filePaths, "+");
+ for (auto filePath : filePathVec)
+ {
+ // if file hasn't already been resolved (the resolvedConfigFiles vector doesn't contain it), resolve it.
+ if (std::find(resolvedConfigFiles.begin(), resolvedConfigFiles.end(), filePath) == resolvedConfigFiles.end())
+ {
+ // Recursively resolve the include statements in the included config files.
+ // Ensure that the same config file isn't included twice, by recording the path in the
+ // resolvedConfigFiles vector before recursing (this is also what breaks include loops).
+ resolvedConfigFiles.push_back(filePath);
+ newConfigString += ResolveIncludeStatements(
+ ReadConfigFile(filePath),
+ resolvedConfigFiles
+ );
+ }
+ else
+ {
+ // We already resolved this path. Write a newline-terminated warning so that user is aware of this.
+ // TODO: This message is written to stderr before stderr gets redirected to the specified file. Fix this.
+ fprintf(stderr, "Warning: Config file included multiple times. Not including config file again: %s\n", filePath.c_str());
+ }
+ }
+ }
+ else
+ {
+ newConfigString += (line + "\n");
+ }
+ }
+ return newConfigString;
+ }
+
+ // LoadConfigFiles - load multiple configuration files, and add their contents to the config parameters
+ // filePaths - A "+" delimited list of file paths, corresponding to config files to load
+ // configStringToAppend - optional config string appended after the file contents and parsed together with them; may be nullptr
+ void ConfigParser::LoadConfigFiles(const std::wstring &filePaths, const std::string *configStringToAppend)
+ {
+ std::string configString = ReadConfigFiles(filePaths);
+ if(configStringToAppend != nullptr)
+ {
+ configString += *configStringToAppend;
+ }
+
+ FileParse(configString);
+ }
+
+ // LoadConfigFileAndResolveVariables - load a configuration file, and add it to the config parameters.
+ // If the config file contains references to variables which are defined in the 'config' ConfigParameters,
+ // then this method will resolve those variables. This method is meant for the processing of NDL/MEL config files,
+ // in order to allow them to access variables defined in the primary config file via $varName$ syntax.
+ // filePath - path of the file to load
+ // config - These ConfigParameters are used in order to resolve the $varName$ instances in the config file.
+ void ConfigParser::LoadConfigFileAndResolveVariables(const std::wstring &filePath, const ConfigParameters& config)
+ {
+ // read the file, let 'config' resolve the variables in its contents, and then parse the result.
+ std::string fileContents = ReadConfigFile(filePath);
+ fileContents = config.ResolveVariables(fileContents);
+ FileParse(fileContents);
+ }
+
+ // LoadConfigFile - load a single configuration file, and add its contents to the config parameters
+ // filePath - path of the file to read
+ void ConfigParser::LoadConfigFile(const std::wstring &filePath)
+ {
+ // read the file into a string and hand that string to the parser
+ FileParse(ReadConfigFile(filePath));
+ }
+
+ // Same as the "ReadConfigFiles" function below, but takes a string instead of a wstring; the paths are converted with msra::strfun::utf16 and forwarded
+ std::string ConfigParser::ReadConfigFiles(const std::string &filePaths)
+ {
+ return ReadConfigFiles(msra::strfun::utf16(filePaths));
+ }
+
+ // ReadConfigFiles - reads multiple config files, concatenates the content from each file in list order, and returns a string
+ // filePaths - A "+" delimited list of file paths, corresponding to config files to read
+ // returns: a string with the concatenated file contents
+ std::string ConfigParser::ReadConfigFiles(const std::wstring &filePaths)
+ {
+ std::string configString;
+ std::vector filePathVec = msra::strfun::split (filePaths, L"+");
+ for (auto filePath : filePathVec)
+ {
+ configString += ReadConfigFile(filePath);
+ }
+ return configString;
+ }
+
+ // Same as the "ReadConfigFile" function below, but takes a string instead of a wstring; the path is converted with msra::strfun::utf16 and forwarded
+ std::string ConfigParser::ReadConfigFile(const std::string &filePath)
+ {
+ return ReadConfigFile(msra::strfun::utf16(filePath));
+ }
+
+ // ReadConfigFile - read a configuration file, and return it as a string; also sets m_configName to the file name without its directory part
+ // filePath - the path to the config file to read
+ // returns: the preprocessed lines of the file, each followed by "\n"; lines that are empty after preprocessing are dropped
+ std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
+ {
+ File file(filePath, fileOptionsRead);
+
+ // name this config after the file: keep only what follows the last '/' or '\' in the path
+ std::string path = msra::strfun::utf8(filePath);
+ auto location = path.find_last_of("/\\");
+ if (location != npos)
+ path = path.substr(location+1);
+ m_configName = move(path);
+
+ // read the file line by line into a string that is pre-sized to the file length
+ // CONSIDER: should the File API support this, instead of line by line?
+ size_t fileLength = file.Size();
+ string str;
+ string configFile;
+ configFile.reserve(fileLength);
+ while (!file.IsEOF())
+ {
+ file.GetLine(str);
+ str = PreprocessConfigLine(str);
+ if (str != "")
+ {
+ configFile.append(str);
+ configFile.append("\n");
+ }
+ }
+ return configFile;
+ }
+
+ // GetFileConfigNames - determine the names of the features and labels sections in 'readerConfig' (the reader configuration whose entries are examined)
+ // features - [in,out] a vector of feature name strings; matching names are appended
+ // labels - [in,out] a vector of label name strings; matching names are appended
+ void GetFileConfigNames(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels)
+ {
+ for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
+ {
+ auto pair = *iter; // NOTE(review): 'pair' is not used below
+ ConfigParameters temp (iter->second);
+ // only entries whose ConfigParameters contain a "dim" element are classified; all others are skipped
+ if (temp.ExistsCurrent("dim"))
+ {
+ if (temp.ExistsCurrent("labelMappingFile")
+ || temp.ExistsCurrent("labelDim")
+ || temp.ExistsCurrent("labelType")
+ || (temp.ExistsCurrent("sectionType") && temp("sectionType") == "labels"))
+ {
+ labels.push_back(msra::strfun::utf16(iter->first));
+ }
+ else
+ {
+ features.push_back(msra::strfun::utf16(iter->first));
+ }
+ }
+ }
+ }
+
+ // FindConfigNames - collect the names of the sections in 'config' that contain a particular key
+ // config - configuration to search
+ // key - string we are searching for in each config section
+ // names - [in,out] vector to which the matching section names are appended (converted with msra::strfun::utf16)
+ void FindConfigNames(const ConfigParameters& config, std::string key, std::vector& names)
+ {
+ for (auto iter = config.begin(); iter != config.end(); ++iter)
+ {
+ auto pair = *iter; // NOTE(review): 'pair' is not used below
+ ConfigParameters temp (iter->second);
+ // if this entry's ConfigParameters contain the requested key, record the entry's name
+ if (temp.ExistsCurrent(key))
+ {
+ names.push_back(msra::strfun::utf16(iter->first));
+ }
+ }
+ }
+
+ // Trim - trim spaces and tabs off the start and end of the string, in place (other whitespace characters are left alone)
+ // str - [in,out] string to trim
+ // NOTE: if the string is empty or consists only of spaces and tabs, it will be set to an empty string
+ void Trim(std::string& str)
+ {
+ auto found = str.find_first_not_of(" \t");
+ if (found == npos)
+ {
+ str.erase(0);
+ return;
+ }
+ str.erase(0, found);
+ found = str.find_last_not_of(" \t");
+ if (found != npos)
+ str.erase(found+1);
+ }
+
}}}
\ No newline at end of file
diff --git a/Common/File.cpp b/Common/File.cpp
index fc77c0af3..896b5dd22 100644
--- a/Common/File.cpp
+++ b/Common/File.cpp
@@ -1,631 +1,633 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "basetypes.h"
-#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
-#include "fileutil.h"
-#include "File.h"
-#include
-#include
-#include
-#ifdef _WIN32
-#include
-#endif
-#ifdef __unix__
-#include
-#endif
-
-namespace Microsoft{ namespace MSR { namespace CNTK {
-
-// File creation
-// filename - the path
-// fileOptions - options to open the file
-File::File(const std::wstring& filename, int fileOptions)
-{
- Init(filename.c_str(), fileOptions);
-}
-
-File::File(const std::string& filename, int fileOptions)
-{
- // this converts from string to wstring, and then to wchar_t*
- Init(msra::strfun::utf16(filename).c_str(), fileOptions);
-}
-
-File::File(const wchar_t* filename, int fileOptions)
-{
- Init(filename, fileOptions);
-}
-
-void File::Init(const wchar_t* filename, int fileOptions)
-{
- msra::files::make_intermediate_dirs(filename);
- // translate the options string into a string for fopen()
- wstring options = fileOptions&fileOptionsRead?L"r":L"";
- if (fileOptions&fileOptionsWrite)
- {
- // if we already are reading the file, change to read/write
- options.clear();
- options.append(L"w+");
- }
- if (fileOptions&fileOptionsBinary)
- {
- options += L"b";
- }
- else
- {
- if (fileOptions & fileOptionsUnicode)
- options += L"b";
- else
- options += L"t";
- // I attempted to use the translated characterset modes, but encountered strange errors
- //options += L"t, ccs=";
- //options += (fileOptions & fileOptionsUnicode)?L"UNICODE":L"UTF-8";
- }
- // add sequential flag to allocate big read buffer
- if (fileOptions & fileOptionsSequential)
- options += L"S";
-
- attempt([=](){m_file = fopenOrDie(filename, options.c_str());});
- m_options = fileOptions;
- m_size = filesize(m_file);
-}
-
-void File::goToDelimiter(int delim)
-{
- int ch=0;
-
- while (ch!=delim) {
- ch=fgetc(m_file);
- if (feof(m_file)) {
- printf("Unexpected end of file\n");
- throw std::logic_error("Unexpected end of file\n");
- }
- }
-}
-
-bool File::IsTextBased()
-{
- return !!(m_options & (fileOptionsText|fileOptionsUnicode));
-}
-
-// File Destructor
-// closes the file
-File::~File(void)
-{
- attempt([=] {fcloseOrDie(m_file);});
-}
-
-// GetLine - get a line from the file
-// str - string to store the line
-void File::GetLine(wstring& str)
-{
- str = fgetlinew(m_file);
-}
-
-// GetLine - get a line from the file
-// str - string
-void File::GetLine(string& str)
-{
- str = fgetline(m_file);
-}
-
-// Put a zero/space terminated wstring into a file
-// val - value to write to the file
-File& File::operator<<(const std::wstring& val)
-{
- WriteString(val.c_str());
- return *this;
-}
-
-
-// Put a zero/space terminated string into a file
-// val - value to write to the file
-File& File::operator<<(const std::string& val)
-{
- WriteString(val.c_str());
- return *this;
-}
-
-// Put a marker in the file, the marker depends on the file type
-// marker - marker to place in the file
-File& File::operator<<(FileMarker marker)
-{
- File& file = *this;
- switch(marker)
- {
- case fileMarkerBeginFile: // beginning of file marker
- // only exists for UNICODE files
- if (m_options & fileOptionsUnicode)
- file << (unsigned int)0xfeff; // byte order mark
- break;
- case fileMarkerEndFile: // end of file marker
- // use ^Z for end of file for text files
- if (m_options & fileOptionsUnicode)
- file << wchar_t(26); // ^Z
- else if (m_options & fileOptionsText)
- file << char(26);
- break;
- case fileMarkerBeginList: // Beginning of list marker
- // no marker written for either
- break;
- case fileMarkerListSeparator: // separate elements of a list
- // do nothing for now, built in space deliminter for all types (before type)
- // future: make this customizable, so you can specify a separator (i.e. ',')
- break;
- case fileMarkerEndList: // end of line/list marker
- if (m_options & fileOptionsUnicode)
- file.WriteString(L"\r\n"); // carriage return/life feed
- else if (m_options & fileOptionsText)
- file.WriteString("\r\n");
- break;
- case fileMarkerBeginSection: // beginning of section
- case fileMarkerEndSection: // end of section
- assert(false); // sections should use a string modifier
- break;
- }
- return file;
-}
-
-// PutMarker for beginning of list support (lists with a count)
-// count - [in] the number of elements in the list
-File& File::PutMarker(FileMarker marker, size_t count)
-{
- assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count markers
- *this << count;
- return *this;
-}
-
-// PutMarker for section beginning and ending tags
-// section - [in]name of section
-File& File::PutMarker(FileMarker marker, const std::string& section)
-{
- File& file = *this;
- // only the section markers take a string parameter
- assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
- file << section;
- return file;
-}
-
-// PutMarker for section beginning and ending tags
-// section - [in]name of section
-File& File::PutMarker(FileMarker marker, const std::wstring& section)
-{
- File& file = *this;
- // only the section markers take a string parameter
- assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
- file << section;
- return file;
-}
-
-// Get a zero terminated wstring from a file
-// val - value to read from the file
-File& File::operator>>(std::wstring& val)
-{
- attempt([&]{
- if (IsTextBased())
- val = fgetwtoken(m_file);
- else
- val = fgetwstring(m_file);
- });
- return *this;
-}
-
-// Get a zero terminated string from a file
-// val - value to read from the file
-File& File::operator>>(std::string& val)
-{
- attempt([&]{
- if (IsTextBased())
- val = fgettoken(m_file);
- else
- val = fgetstring(m_file);
- });
- return *this;
-}
-
-// ReadChars - read a specified number of characters, and reset read pointer if requested
-// val - [in,out] return value will be returned here
-// cnt - number of characters to read
-// reset - reset the read pointer
-void File::ReadChars(std::string& val, size_t cnt, bool reset)
-{
- size_t pos = 0; // (initialize to keep compiler happy)
- if (reset)
- pos = GetPosition();
- val.resize(cnt);
- char *str = const_cast(val.c_str());
- for (int i=0;i < cnt;++i)
- *this >> str[i];
- if (reset)
- SetPosition(pos);
-}
-
-// ReadChars - read a specified number of characters, and reset read pointer if requested
-// val - [in,out] return value will be returned here
-// cnt - number of characters to read
-// reset - reset the read pointer
-void File::ReadChars(std::wstring& val, size_t cnt, bool reset)
-{
- size_t pos = 0; // (initialize to keep compiler happy)
- if (reset)
- pos = GetPosition();
- val.resize(cnt);
- wchar_t *str = const_cast(val.c_str());
- for (int i=0;i < cnt;++i)
- *this >> str[i];
- if (reset)
- SetPosition(pos);
-}
-
-// WriteString - outputs a string into the file
-// str - the string to output
-// size - size of the string to output, if zero null terminated
-void File::WriteString(const char* str, int size)
-{
- attempt([&]{
- if (size > 0)
- {
- fwprintf(m_file, L" %.*hs", size, str);
- }
- else
- {
- if (IsTextBased())
- fwprintf(m_file, L" %hs", str);
- else
- fputstring (m_file, str);
- }
- });
-}
-
-// ReadString - reads a string into the file
-// str - the string buffer to read the string into
-// size - size of the string string buffer
-void File::ReadString(char* str, int size)
-{
- attempt([&]{
- if (IsTextBased())
- fgettoken(m_file, str, size);
- else
- fgetstring (m_file, str, size);
- });
-}
-
-// WriteString - outputs a string into the file
-// if writing to text based file and spaces are embedded, writes quotes around string
-// str - the string to output
-// size - size of the string to output, if zero null terminated
-void File::WriteString(const wchar_t* str, int size)
-{
- attempt([&]{
-#ifdef EMBEDDED_SPACES
- // start of implementation of embedded space support with quoting
- // not complete, not sure if we need it
- bool spacefound = false;
- wchar_t quote = 0;
- if (IsTextBased())
- {
- // search for embedded spaces and quotes
- wstring searchString = L" \"'~";
- const wchar_t* result = NULL;
- while (result = wcspbrk(str, searchString.c_str()))
- {
- if (IsWhiteSpace(*result))
- spacefound = true;
- searchString.find(*result, 0);
- }
- }
-#endif
- if (size > 0)
- {
- fwprintf(m_file, L" %.*ls", size, str);
- }
- else
- {
- if (IsTextBased())
- fwprintf(m_file, L" %ls", str);
- else
- fputstring (m_file, str);
- }
- });
-}
-
-// ReadString - reads a string into the file
-// str - the string buffer to read the string into
-// size - size of the string string buffer
-void File::ReadString(wchar_t* str, int size)
-{
- attempt([&]{
- if (IsTextBased())
- fgettoken(m_file, str, size);
- else
- fgetstring (m_file, str, size);
- });
-}
-
-// IsUnicodeBOM - is the next characters the Unicode Byte Order Mark?
-// skip - skip the BOM mark if found (defaults to false)
-// returns - true if on a unicode BOM
-bool File::IsUnicodeBOM(bool skip)
-{
- File& file = *this;
- uint64_t pos = GetPosition();
- // if we aren't at the beginning of the file, it can't be the byte order mark
- if (pos != 0)
- return false;
-
- // only exists for UNICODE files
- bool found = false;
- if (m_options & fileOptionsUnicode)
- {
- unsigned int bom=0;
- if (IsTextBased())
- ftrygetText(m_file, bom);
- else
- fget(m_file, bom);
- // future: one reason for the BOM is to detect other-endian files, should we support?
- found = (bom == 0xfeff);
- }
- else if (m_options & fileOptionsText)
- {
- char val[3];
- file.ReadString(val, 3);
- found = (val[0] == 0xEF && val[1] == 0xBB && val[2] == 0xBF);
- }
- // restore pointer if no BOM or we aren't skipping it
- if (!found || !skip)
- {
- SetPosition(pos);
- }
- return found;
-}
-
-//Size - return the size of the file
-// WARNING: calling this will reset the EOF marker, so do so with care
-size_t File::Size()
-{
- return filesize(m_file);
-}
-
-// IsEOF - if we have read past the end of the file
-// return - true if end of file has been found
-bool File::IsEOF()
-{
- return !!feof(m_file);
-}
-
-// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
-// skip - skip the whitespace if found (defaults to false)
-// returns - true if whitespace found
-bool File::IsWhiteSpace(bool skip)
-{
- bool spaceFound = false;
- bool spaceCur = false;
- if (m_options & fileOptionsUnicode)
- {
- wint_t c;
- do
- {
- c = fgetwc (m_file);
- if (c == WEOF) // hit the end
- return spaceFound;
- spaceCur = !!iswspace(c);
- spaceFound = spaceFound || spaceCur;
- } while (spaceCur && skip);
- // put back the last character (WEOF is ignored)
- ungetwc(c, m_file);
- }
- else
- {
- int c;
- do
- {
- c = fgetc (m_file);
- if (c == EOF) // hit the end
- return spaceFound;
- spaceCur = !!isspace(c);
- spaceFound = spaceFound || spaceCur;
- } while (spaceCur && skip);
- // put back the last character (EOF is ignored)
- ungetc(c, m_file);
- }
-
- return spaceFound;
-}
-
-// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceeded by (space, \t)? EOF detected too
-// skip - skip the end of line if found (defaults to false)
-// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
-int File::EndOfLineOrEOF(bool skip)
-{
- int found = false;
- if (m_options & fileOptionsUnicode)
- found = fskipwNewline(m_file,skip);
- else if (m_options & fileOptionsText)
- found = fskipNewline(m_file, skip);
- return found;
-}
-
-
-// Get a marker from the file
-// some are ignored others are expecting characters
-// must use GetMarker methods for those that require parameters
-File& File::operator>>(FileMarker marker)
-{
- File& file = *this;
-
- switch(marker)
- {
- case fileMarkerBeginFile: // beginning of file marker
- // check for Unicode BOM marker
- if (IsTextBased())
- IsUnicodeBOM(true);
- break;
- case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
- if (!IsEOF())
- throw std::runtime_error("fileMarkerEndFile not found");
- break;
- case fileMarkerBeginList: // Beginning of list marker
- // no marker written unless an list with a count header
- break;
- case fileMarkerListSeparator: // separate elements of a list
- // do nothing for now, built in space deliminter for all types (before type)
- // future: make this customizable, so you can specify a separator (i.e. ',')
- break;
- case fileMarkerEndList: // end of line/list marker
- if (IsTextBased())
- {
- int found = EndOfLineOrEOF(true);
- if (found != (int)true) // EOF can also be returned
- throw std::runtime_error("Newline not found");
- }
- break;
- case fileMarkerBeginSection: // beginning of section
- case fileMarkerEndSection: // end of section
- assert(false); // sections should use a string modifier
- break;
- }
- return file;
-}
-
-// Get a marker from the file
-// some are ignored others are expecting characters
-// must use GetMarker methods for those that require parameters
-bool File::IsMarker(FileMarker marker, bool skip)
-{
- bool retval = false;
- switch(marker)
- {
- case fileMarkerBeginFile: // beginning of file marker
- // check for Unicode BOM marker
- retval = IsUnicodeBOM(skip);
- break;
- case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
- retval = IsEOF();
- break;
- case fileMarkerBeginList: // Beginning of list marker
- // no marker written unless an list with a count header
- // should we try to validate BOL header (just know it's an int, not negative, etc.)
- break;
- case fileMarkerListSeparator: // separate elements of a list
- // do nothing for now, built in space deliminter for all types (before type)
- // future: make this customizable, so you can specify a separator (i.e. ',')
- break;
- case fileMarkerEndList: // end of line/list marker
- if (IsTextBased())
- {
- int eolSeen = false;
- eolSeen = EndOfLineOrEOF(skip);
- retval = (eolSeen == (int)true);
- }
- break;
- case fileMarkerBeginSection: // beginning of section
- case fileMarkerEndSection: // end of section
- // can't destinquish from a string currently
- break;
- }
- return retval;
-}
-
-
-// GetMarker for beginning of list support (lists with a count)
-// count - [out] returns the number of elements in the list
-File& File::GetMarker(FileMarker marker, size_t& count)
-{
- assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count file markers
- // use text based try, so it can fail without an exception
- if (IsTextBased())
- ftrygetText(m_file, count);
- else
- fget(m_file, count);
- return *this;
-}
-
-// GetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-File& File::GetMarker(FileMarker marker, const std::string& section)
-{
- // only the section markers take a string parameter
- assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
- string str;
- *this >> str;
- if (str != section)
- throw std::runtime_error(std::string("section name mismatch ") + str + " != " + section);
- return *this;
-}
-
-// GetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-File& File::GetMarker(FileMarker marker, const std::wstring& section)
-{
- // only the section markers take a string parameter
- assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
- wstring str;
- *this >> str;
- if (str != section)
- throw std::runtime_error(std::string("section name mismatch ") + msra::strfun::utf8(str) + " != " + msra::strfun::utf8(section));
- return *this;
-}
-
-// TryGetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
-{
- // only the section markers take a string parameter
- assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
- size_t pos = GetPosition();
- std::wstring str;
- try
- {
- *this >> str;
- if (str == section)
- return true;
- }
- catch(...)
- {
- //eat
- }
- SetPosition(pos);
- return false;
-}
-
-// TryGetMarker for section beginning and ending tags
-// section - [in]name of section that is expected
-bool File::TryGetMarker(FileMarker marker, const std::string& section)
-{
- // only the section markers take a string parameter
- assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
- size_t pos = GetPosition();
- std::string str;
- try
- {
- *this >> str;
- if (str == section)
- return true;
- }
- catch(...)
- {
- return false;
- }
- SetPosition(pos);
- return false;
-}
-
-// GetPosition - Get position in a file
-uint64_t File::GetPosition()
-{
- return fgetpos(m_file);
-}
-
-// Set the position in the file
-// pos - position in the file
-void File::SetPosition(uint64_t pos)
-{
- fsetpos (m_file, pos);
-}
-
-}}}
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
+#include "basetypes.h"
+#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
+#include "fileutil.h"
+#include "File.h"
+#include
+#include
+#include
+#ifdef _WIN32
+#include
+#endif
+#ifdef __unix__
+#include
+#endif
+
+namespace Microsoft{ namespace MSR { namespace CNTK {
+
+// File constructor - opens the file named by a wide string
+// filename - the path; fileOptions - combination of fileOptions* flags, passed on to Init()
+File::File(const std::wstring& filename, int fileOptions)
+{
+    const wchar_t* path = filename.c_str();
+    Init(path, fileOptions);
+}
+
+File::File(const std::string& filename, int fileOptions)
+{
+    const auto widePath = msra::strfun::utf16(filename); // string -> wide string; Init() wants a wchar_t* path
+    Init(widePath.c_str(), fileOptions);
+}
+
+File::File(const wchar_t* filename, int fileOptions) // opens the file named by a wide C string
+{
+ Init(filename, fileOptions); // like the other constructors, all work is done by Init()
+}
+
+// Init - shared implementation of the constructors: calls make_intermediate_dirs, opens the file, records its size
+// filename - path of the file to open
+// fileOptions - combination of fileOptions* flags (read/write, binary/text/Unicode, sequential)
+void File::Init(const wchar_t* filename, int fileOptions)
+{
+    msra::files::make_intermediate_dirs(filename);
+
+    // translate the option flags into a mode string for fopen(): access mode first
+    wstring mode;
+    if (fileOptions & fileOptionsWrite)
+    {
+        // writing takes precedence over reading: "w+" opens read/write
+        mode = L"w+";
+    }
+    else if (fileOptions & fileOptionsRead)
+    {
+        mode = L"r";
+    }
+
+    // then the translation mode: Unicode text files are opened as binary too, only plain text uses "t"
+    // I attempted to use the translated characterset modes, but encountered strange errors
+    // (that was: append L"t, ccs=" followed by L"UNICODE" or L"UTF-8" depending on fileOptionsUnicode)
+    const bool openAsBinary = (fileOptions & (fileOptionsBinary | fileOptionsUnicode)) != 0;
+    mode += openAsBinary ? L"b" : L"t";
+
+    // add sequential flag to allocate big read buffer
+    if (fileOptions & fileOptionsSequential)
+        mode += L"S";
+
+    attempt([=](){m_file = fopenOrDie(filename, mode.c_str());});
+    m_options = fileOptions;
+    m_size = filesize(m_file);
+}
+
+// goToDelimiter - read and discard characters until delim has been consumed; throws std::logic_error at end of file
+void File::goToDelimiter(int delim)
+{
+    for (int ch = 0; ch != delim; ) // ch starts at 0, so a delim of 0 returns without reading anything
+    {
+        ch = fgetc(m_file);
+        if (!feof(m_file))
+            continue;
+        printf("Unexpected end of file\n");
+        throw std::logic_error("Unexpected end of file\n");
+    }
+}
+
+bool File::IsTextBased() // true when the file was opened with the text or the Unicode option
+{
+ return !!(m_options & (fileOptionsText|fileOptionsUnicode)); // !! turns the masked flags into a bool
+}
+
+// File Destructor
+// closes the file; the close is wrapped in attempt(), just like the open in Init()
+File::~File(void)
+{
+ attempt([=] {fcloseOrDie(m_file);});
+}
+
+// GetLine - get a line from the file (wide-character version, reads through fgetlinew)
+// str - [out] string to store the line; any previous content is replaced
+void File::GetLine(wstring& str)
+{
+ str = fgetlinew(m_file);
+}
+
+// GetLine - get a line from the file (narrow-character version, reads through fgetline)
+// str - [out] string to store the line; any previous content is replaced
+void File::GetLine(string& str)
+{
+ str = fgetline(m_file);
+}
+
+// Put a zero/space terminated wstring into a file (forwards to WriteString without an explicit size)
+// val - value to write to the file
+File& File::operator<<(const std::wstring& val)
+{
+ WriteString(val.c_str());
+ return *this; // returning the file allows << calls to be chained
+}
+
+
+// Put a zero/space terminated string into a file (forwards to WriteString without an explicit size)
+// val - value to write to the file
+File& File::operator<<(const std::string& val)
+{
+ WriteString(val.c_str());
+ return *this; // returning the file allows << calls to be chained
+}
+
+// Put a marker in the file; what (if anything) gets written depends on the marker and on the file options
+// marker - marker to place in the file; the section markers are not accepted here (see the assert below)
+File& File::operator<<(FileMarker marker)
+{
+ File& file = *this;
+ switch(marker)
+ {
+ case fileMarkerBeginFile: // beginning of file marker
+ // only exists for UNICODE files
+ if (m_options & fileOptionsUnicode)
+ file << (unsigned int)0xfeff; // byte order mark
+ break;
+ case fileMarkerEndFile: // end of file marker
+ // use ^Z (character code 26) for end of file for text files; other files get no marker
+ if (m_options & fileOptionsUnicode)
+ file << wchar_t(26); // ^Z
+ else if (m_options & fileOptionsText)
+ file << char(26); // ^Z
+ break;
+ case fileMarkerBeginList: // Beginning of list marker
+ // no marker written for any file type
+ break;
+ case fileMarkerListSeparator: // separate elements of a list
+ // do nothing for now, built in space delimiter for all types (before type)
+ // future: make this customizable, so you can specify a separator (e.g. ',')
+ break;
+ case fileMarkerEndList: // end of line/list marker
+ if (m_options & fileOptionsUnicode)
+ file.WriteString(L"\r\n"); // carriage return/line feed
+ else if (m_options & fileOptionsText)
+ file.WriteString("\r\n"); // carriage return/line feed
+ break;
+ case fileMarkerBeginSection: // beginning of section
+ case fileMarkerEndSection: // end of section
+ assert(false); // sections need a name: use PutMarker() with a string instead
+ break;
+ }
+ return file;
+}
+
+// PutMarker for beginning of list support (lists with a count): writes the count and nothing else
+// marker - expected to be fileMarkerBeginList (checked by assert only); count - [in] the number of elements in the list
+File& File::PutMarker(FileMarker marker, size_t count)
+{
+ assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count markers
+ *this << count; // the count itself serves as the list header
+ return *this;
+}
+
+// PutMarker - write a section begin/end tag; the tag is simply the section name written as a string
+// marker - fileMarkerBeginSection or fileMarkerEndSection (checked by assert only); section - [in] name of section
+File& File::PutMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    // no local alias needed: stream the name straight into this file
+    *this << section;
+    return *this;
+}
+
+// PutMarker - write a section begin/end tag; the tag is simply the section name written as a wide string
+// marker - fileMarkerBeginSection or fileMarkerEndSection (checked by assert only); section - [in] name of section
+File& File::PutMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    // no local alias needed: stream the name straight into this file
+    *this << section;
+    return *this;
+}
+
+// Get a wstring from a file: read with fgetwstring, or with fgetwtoken when the file is text based
+// val - [out] value read from the file
+File& File::operator>>(std::wstring& val)
+{
+    attempt([&]{
+        if (!IsTextBased())
+            val = fgetwstring(m_file);
+        else
+            val = fgetwtoken(m_file);
+    });
+    return *this;
+}
+
+// Get a string from a file: read with fgetstring, or with fgettoken when the file is text based
+// val - [out] value read from the file
+File& File::operator>>(std::string& val)
+{
+    attempt([&]{
+        if (!IsTextBased())
+            val = fgetstring(m_file);
+        else
+            val = fgettoken(m_file);
+    });
+    return *this;
+}
+
+// ReadChars - read a specified number of characters, and reset read pointer if requested
+// val - [in,out] resized to cnt and filled with the characters read
+// cnt - number of characters to read
+// reset - when true, the read position is put back to where it was before the call
+void File::ReadChars(std::string& val, size_t cnt, bool reset)
+{
+    uint64_t pos = 0; // (initialize to keep compiler happy); same width as GetPosition() returns
+    if (reset)
+        pos = GetPosition();
+    val.resize(cnt);
+    // read straight into the string's elements; the index has the same unsigned type as cnt
+    for (size_t i = 0; i < cnt; ++i)
+        *this >> val[i];
+    if (reset)
+        SetPosition(pos);
+}
+
+// ReadChars - read a specified number of wide characters, and reset read pointer if requested
+// val - [in,out] resized to cnt and filled with the characters read
+// cnt - number of characters to read
+// reset - when true, the read position is put back to where it was before the call
+void File::ReadChars(std::wstring& val, size_t cnt, bool reset)
+{
+    uint64_t pos = 0; // (initialize to keep compiler happy); same width as GetPosition() returns
+    if (reset)
+        pos = GetPosition();
+    val.resize(cnt);
+    // read straight into the string's elements; the index has the same unsigned type as cnt
+    for (size_t i = 0; i < cnt; ++i)
+        *this >> val[i];
+    if (reset)
+        SetPosition(pos);
+}
+
+// WriteString - outputs a string into the file
+// str - the string to output
+// size - number of characters to output; zero (or less) means str is null terminated and written whole
+void File::WriteString(const char* str, int size)
+{
+    attempt([&]{
+        if (size > 0)
+        {
+            // an explicit length always takes the formatted path (a leading space is written first)
+            fwprintf(m_file, L" %.*hs", size, str);
+            return;
+        }
+        // null terminated: formatted (again with a leading space) for text based files, fputstring otherwise
+        if (IsTextBased())
+            fwprintf(m_file, L" %hs", str);
+        else
+            fputstring (m_file, str);
+    });
+}
+
+// ReadString - reads a string from the file
+// str - the string buffer to read the string into
+// size - size of the string buffer
+void File::ReadString(char* str, int size)
+{
+    attempt([&]{
+        if (!IsTextBased())
+            fgetstring (m_file, str, size);
+        else
+            fgettoken(m_file, str, size);
+    });
+}
+
+// WriteString - outputs a wide string into the file
+// (quoting of strings with embedded spaces is only sketched, inside #ifdef EMBEDDED_SPACES, and writes nothing yet)
+// str - the string to output
+// size - number of characters to output; zero (or less) means str is null terminated and written whole
+void File::WriteString(const wchar_t* str, int size)
+{
+ attempt([&]{
+#ifdef EMBEDDED_SPACES
+ // start of implementation of embedded space support with quoting
+ // not complete, not sure if we need it
+ bool spacefound = false;
+ wchar_t quote = 0;
+ if (IsTextBased())
+ {
+ // search for embedded spaces and quotes
+ wstring searchString = L" \"'~";
+ const wchar_t* result = NULL;
+ while (result = wcspbrk(str, searchString.c_str())) // NOTE(review): neither str nor searchString changes in the loop, so a match never ends it
+ {
+ if (IsWhiteSpace(*result))
+ spacefound = true;
+ searchString.find(*result, 0); // result of find() is discarded
+ }
+ }
+#endif
+ if (size > 0)
+ {
+ fwprintf(m_file, L" %.*ls", size, str); // explicit length: formatted output, preceded by a space
+ }
+ else
+ {
+ if (IsTextBased())
+ fwprintf(m_file, L" %ls", str); // text based: whole string, preceded by a space
+ else
+ fputstring (m_file, str);
+ }
+ });
+}
+
+// ReadString - reads a wide string from the file
+// str - the string buffer to read the string into
+// size - size of the string buffer
+void File::ReadString(wchar_t* str, int size)
+{
+    attempt([&]{
+        if (!IsTextBased())
+            fgetstring (m_file, str, size);
+        else
+            fgettoken(m_file, str, size);
+    });
+}
+
+// IsUnicodeBOM - are the next characters the Unicode Byte Order Mark?
+// skip - skip the BOM mark if found (defaults to false)
+// returns - true if on a unicode BOM; always false when not at the very start of the file
+bool File::IsUnicodeBOM(bool skip)
+{
+    const uint64_t pos = GetPosition();
+    // if we aren't at the beginning of the file, it can't be the byte order mark
+    if (pos != 0)
+        return false;
+
+    bool found = false;
+    if (m_options & fileOptionsUnicode)
+    {
+        // Unicode files: the mark is read as a single unsigned int value
+        unsigned int bom=0;
+        if (IsTextBased())
+            ftrygetText(m_file, bom);
+        else
+            fget(m_file, bom);
+        // future: one reason for the BOM is to detect other-endian files, should we support?
+        found = (bom == 0xfeff);
+    }
+    else if (m_options & fileOptionsText)
+    {
+        // plain text files: look for the three byte sequence EF BB BF
+        // NOTE(review): ReadString() gets a 3 byte buffer - confirm it needs no room for a terminator and does not stop early
+        char val[3] = { 0, 0, 0 };
+        ReadString(val, 3);
+        // compare as unsigned char: where plain char is signed, val[0] == 0xEF could never be true
+        found = ((unsigned char)val[0] == 0xEF && (unsigned char)val[1] == 0xBB && (unsigned char)val[2] == 0xBF);
+    }
+    // restore pointer if no BOM or we aren't skipping it
+    if (!found || !skip)
+        SetPosition(pos);
+    return found;
+}
+
+// Size - return the size of the file (queried from the open file on every call, not the m_size value set in Init)
+// WARNING: calling this will reset the EOF marker, so do so with care
+size_t File::Size()
+{
+ return filesize(m_file);
+}
+
+// IsEOF - have we read past the end of the file? (reports the stream's feof() state)
+// return - true if end of file has been found
+bool File::IsEOF()
+{
+ return !!feof(m_file); // !! converts feof()'s int result into a bool
+}
+
+// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
+// skip - skip the whitespace if found (defaults to false); without it only one character is examined
+// returns - true if whitespace found
+bool File::IsWhiteSpace(bool skip)
+{
+    bool sawSpace = false;
+    if (m_options & fileOptionsUnicode)
+    {
+        for (;;)
+        {
+            const wint_t wch = fgetwc (m_file);
+            if (wch == WEOF) // hit the end
+                return sawSpace;
+            const bool isSpace = !!iswspace(wch);
+            sawSpace = sawSpace || isSpace;
+            if (isSpace && skip)
+                continue;
+            ungetwc(wch, m_file); // put back the last character read and stop
+            break;
+        }
+    }
+    else
+    {
+        for (;;)
+        {
+            const int ch = fgetc (m_file);
+            if (ch == EOF) // hit the end
+                return sawSpace;
+            const bool isSpace = !!isspace(ch);
+            sawSpace = sawSpace || isSpace;
+            if (isSpace && skip)
+                continue;
+            ungetc(ch, m_file); // put back the last character read and stop
+            break;
+        }
+    }
+    return sawSpace;
+}
+
+// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceded by (space, \t)? EOF detected too
+// skip - skip the end of line if found (defaults to false)
+// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
+int File::EndOfLineOrEOF(bool skip)
+{
+    if (m_options & fileOptionsUnicode)
+        return fskipwNewline(m_file,skip);
+    if (m_options & fileOptionsText)
+        return fskipNewline(m_file, skip);
+    // neither the Unicode nor the text option is set: nothing is read and nothing is reported
+    return false;
+}
+
+
+// Get a marker from the file
+// some markers are ignored, others expect specific characters and throw std::runtime_error when they are missing
+// must use GetMarker methods for those that require parameters
+File& File::operator>>(FileMarker marker)
+{
+ File& file = *this;
+
+ switch(marker)
+ {
+ case fileMarkerBeginFile: // beginning of file marker
+ // check for Unicode BOM marker, and skip it if present (a missing BOM is not an error)
+ if (IsTextBased())
+ IsUnicodeBOM(true);
+ break;
+ case fileMarkerEndFile: // end of file marker; throws if the stream is not at end of file
+ if (!IsEOF())
+ throw std::runtime_error("fileMarkerEndFile not found");
+ break;
+ case fileMarkerBeginList: // Beginning of list marker
+ // no marker written unless it is a list with a count header
+ break;
+ case fileMarkerListSeparator: // separate elements of a list
+ // do nothing for now, built in space delimiter for all types (before type)
+ // future: make this customizable, so you can specify a separator (e.g. ',')
+ break;
+ case fileMarkerEndList: // end of line/list marker
+ if (IsTextBased())
+ {
+ int found = EndOfLineOrEOF(true);
+ if (found != (int)true) // EOF can also be returned, and is treated as "no newline" here
+ throw std::runtime_error("Newline not found");
+ }
+ break;
+ case fileMarkerBeginSection: // beginning of section
+ case fileMarkerEndSection: // end of section
+ assert(false); // sections need a name: use GetMarker() with a string instead
+ break;
+ }
+ return file;
+}
+
+// IsMarker - test whether the given marker is next in the file, without throwing
+// marker - marker to look for; skip - passed on to the BOM / end-of-line checks so a found marker is consumed
+// returns - true if the marker was found; always false for the list-begin, list-separator and section markers
+bool File::IsMarker(FileMarker marker, bool skip)
+{
+ bool retval = false;
+ switch(marker)
+ {
+ case fileMarkerBeginFile: // beginning of file marker
+ // check for Unicode BOM marker
+ retval = IsUnicodeBOM(skip);
+ break;
+ case fileMarkerEndFile: // end of file marker: found when the stream is at end of file
+ retval = IsEOF();
+ break;
+ case fileMarkerBeginList: // Beginning of list marker
+ // no marker written unless it is a list with a count header
+ // should we try to validate BOL header (just know it's an int, not negative, etc.)
+ break;
+ case fileMarkerListSeparator: // separate elements of a list
+ // do nothing for now, built in space delimiter for all types (before type)
+ // future: make this customizable, so you can specify a separator (e.g. ',')
+ break;
+ case fileMarkerEndList: // end of line/list marker
+ if (IsTextBased())
+ {
+ int eolSeen = false;
+ eolSeen = EndOfLineOrEOF(skip);
+ retval = (eolSeen == (int)true); // an EOF result does not count as an end of line
+ }
+ break;
+ case fileMarkerBeginSection: // beginning of section
+ case fileMarkerEndSection: // end of section
+ // can't distinguish from a string currently
+ break;
+ }
+ return retval;
+}
+
+
+// GetMarker for beginning of list support (lists with a count)
+// marker - expected to be fileMarkerBeginList (checked by assert only)
+// count - [out] returns the number of elements in the list
+File& File::GetMarker(FileMarker marker, size_t& count)
+{
+    assert(marker == fileMarkerBeginList); marker; // only beginning of list supported for count file markers
+    if (!IsTextBased())
+        fget(m_file, count);
+    else
+        ftrygetText(m_file, count); // use text based try, so it can fail without an exception
+    return *this;
+}
+
+// GetMarker for section beginning and ending tags: reads the next string and checks it against the expected name
+// section - [in] name of section that is expected; any other name raises std::runtime_error
+File& File::GetMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    string found;
+    *this >> found;
+    if (found == section)
+        return *this;
+    throw std::runtime_error(std::string("section name mismatch ") + found + " != " + section);
+}
+
+// GetMarker for section beginning and ending tags: reads the next wide string and checks it against the expected name
+// section - [in] name of section that is expected; any other name raises std::runtime_error
+File& File::GetMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    wstring found;
+    *this >> found;
+    if (found == section)
+        return *this;
+    throw std::runtime_error(std::string("section name mismatch ") + msra::strfun::utf8(found) + " != " + msra::strfun::utf8(section));
+}
+
+// TryGetMarker - like GetMarker for section tags, but reports a mismatch through the return value
+// section - [in] name of section that is expected; returns true if it was read, false (position restored) if not
+bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    const uint64_t pos = GetPosition(); // same width as GetPosition() returns, so the position is not narrowed
+    std::wstring str;
+    try
+    {
+        *this >> str;
+        if (str == section)
+            return true;
+    }
+    catch(...)
+    {
+        // a failed read counts as "not found": fall through so the read position is restored
+    }
+    SetPosition(pos);
+    return false;
+}
+
+// TryGetMarker - like GetMarker for section tags, but reports a mismatch through the return value
+// section - [in] name of section that is expected; returns true if it was read, false (position restored) if not
+bool File::TryGetMarker(FileMarker marker, const std::string& section)
+{
+    // only the section markers take a string parameter
+    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); marker;
+    const uint64_t pos = GetPosition(); // same width as GetPosition() returns, so the position is not narrowed
+    std::string str;
+    try
+    {
+        *this >> str;
+        if (str == section)
+            return true;
+    }
+    catch(...)
+    {
+        // a failed read counts as "not found": fall through so the position is restored, as in the wstring overload
+    }
+    SetPosition(pos);
+    return false;
+}
+
+// GetPosition - Get the current position in the file (uses a single-argument fgetpos() helper, not the C library call)
+uint64_t File::GetPosition()
+{
+ return fgetpos(m_file);
+}
+
+// SetPosition - Set the position in the file (uses an fsetpos() helper that takes the position by value)
+// pos - position in the file, e.g. a value obtained earlier from GetPosition()
+void File::SetPosition(uint64_t pos)
+{
+ fsetpos (m_file, pos);
+}
+
+}}}
diff --git a/Common/Include/TimerUtility.h b/Common/Include/TimerUtility.h
new file mode 100644
index 000000000..c964f4282
--- /dev/null
+++ b/Common/Include/TimerUtility.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#define MS_PER_SEC 1000
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+    class Timer // utility class whose only functionality is the static MilliSecondElapsed()
+    {
+    public:
+        Timer() {}
+        ~Timer() {}
+        static unsigned long long MilliSecondElapsed(); // declared here, defined out of line in Common/TimerUtility.cpp
+    };
+}}}
diff --git a/Common/TimerUtility.cpp b/Common/TimerUtility.cpp
new file mode 100644
index 000000000..f0fe29342
--- /dev/null
+++ b/Common/TimerUtility.cpp
@@ -0,0 +1,39 @@
+#include "TimerUtility.h"
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <time.h>
+#endif
+namespace Microsoft{
+    namespace MSR {
+        namespace CNTK {
+
+            // Returns the current wall-clock time in milliseconds, counted from 1970-01-01 on both platforms
+            unsigned long long Timer::MilliSecondElapsed()
+            {
+#ifdef WIN32
+                FILETIME ft;
+                LARGE_INTEGER li;
+
+                GetSystemTimeAsFileTime(&ft); // ideally we should use GetSystemTimePreciseAsFileTime. But it's only available with Win8+ and Win Server 2012+
+                li.LowPart = ft.dwLowDateTime;
+                li.HighPart = ft.dwHighDateTime;
+
+                unsigned long long ret = li.QuadPart; // FILETIME value: 100-nanosecond intervals since 1601-01-01
+                ret -= 116444736000000000LL; // Make the values consistent with Linux.
+                ret /= 10000; // From 100 nano seconds (10^-7) to 1 millisecond (10^-3)
+
+                return ret;
+#else
+                timespec ts;
+                clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
+
+                // UINT64 is a Windows-only typedef; tv_sec is widened before scaling so a narrow time_t cannot overflow
+                const unsigned long long ret = static_cast<unsigned long long>(ts.tv_sec) * 1000ULL + static_cast<unsigned long long>(ts.tv_nsec / 1000000);
+                return ret;
+#endif
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Common/fileutil.cpp b/Common/fileutil.cpp
index 4b2e3c565..9f6b6b134 100644
--- a/Common/fileutil.cpp
+++ b/Common/fileutil.cpp
@@ -4,7 +4,10 @@
//
//
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+#endif
+
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
#define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux
diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp
index ebb659525..2269d8779 100644
--- a/DataReader/HTKMLFReader/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp
@@ -49,17 +49,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_truncated = readerConfig("Truncated", "false");
m_convertLabelsToTargets = false;
- m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
+ ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
+ m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
- if (m_numberOfuttsPerMinibatch < 1)
+ for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
{
- LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+ m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
+ if (m_numberOfuttsPerMinibatch < 1)
+ {
+ LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+ }
+
+ if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
+ {
+ LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
+ }
}
- if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
- {
- LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
- }
+ m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// get the read method, defaults to "blockRandomize" other option is "rollingWindow"
std::string readMethod(readerConfig("readMethod","blockRandomize"));
+ if (readMethod == "blockRandomize" && randomize == randomizeNone)
+ {
+ fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto");
+ randomize = randomizeAuto;
+ }
+
// see if they want to use readAhead
m_readAhead = readerConfig("readAhead", "false");
@@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now get the frame source. This has better randomization and doesn't create temp files
m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
+ m_frameSource->setverbosity(verbosity);
//m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
}
@@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_mbSize = mbSize;
+ m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
+
+ m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
+ m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
+ m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
+ m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
+ m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
+
if (m_trainOrTest)
{
StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);
diff --git a/DataReader/HTKMLFReader/HTKMLFReader.h b/DataReader/HTKMLFReader/HTKMLFReader.h
index 3b7692f4b..a4e90da3d 100644
--- a/DataReader/HTKMLFReader/HTKMLFReader.h
+++ b/DataReader/HTKMLFReader/HTKMLFReader.h
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
//
//
// Copyright (c) Microsoft Corporation. All rights reserved.
@@ -111,4 +112,117 @@ public:
void SetSentenceEnd(int /*actualMbSize*/){};
};
+=======
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
+#pragma once
+#include "DataReader.h"
+#include "commandArgUtil.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template // NOTE(review): no template parameter list is visible on this line - confirm against the real header
+class HTKMLFReader : public IDataReader // data reader implementing IDataReader for HTK feature / MLF label input
+{
+private:
+ msra::dbn::minibatchiterator* m_mbiter;
+ msra::dbn::minibatchsource* m_frameSource; // HTKMLFReader.cpp assigns a new msra::dbn::minibatchutterancesourcemulti here
+ msra::dbn::minibatchreadaheadsource* m_readAheadSource;
+ msra::dbn::FileEvalSource* m_fileEvalSource;
+ msra::dbn::latticesource* m_lattices; // dereferenced and passed to the frame source constructor
+ map m_latticeMap; // passed to the frame source constructor together with m_lattices
+
+ vector m_sentenceEnd; // assigned m_numberOfuttsPerMinibatch entries, all true, when (re)initialised
+ bool m_readAhead; // config "readAhead", default "false"
+ bool m_truncated; // config "Truncated", default "false"; when false every nbrUttsInEachRecurrentIter entry must be 1
+ vector m_processedFrame; // assigned m_numberOfuttsPerMinibatch zeros when (re)initialised
+ intargvector m_numberOfuttsPerMinibatchForAllEpochs; // config "nbruttsineachrecurrentiter", default "1"; indexed by epoch
+ size_t m_numberOfuttsPerMinibatch; // entry of the vector above for the current epoch; values below 1 are rejected
+ size_t m_actualnumberOfuttsPerMinibatch; // initialised to m_numberOfuttsPerMinibatch
+ size_t m_mbSize; // set from the mbSize argument when a minibatch loop is started
+ vector m_toProcess; // assigned m_numberOfuttsPerMinibatch zeros when (re)initialised
+ vector m_switchFrame; // assigned m_numberOfuttsPerMinibatch zeros when (re)initialised
+ bool m_noData;
+
+ bool m_trainOrTest; // if false, in file writing mode
+
+ std::map m_idToLabelMap;
+
+ bool m_partialMinibatch; // allow partial minibatches?
+
+ std::vector m_featuresBufferMultiUtt;
+ std::vector m_featuresBufferAllocatedMultiUtt;
+ std::vector m_labelsBufferMultiUtt;
+ std::vector m_labelsBufferAllocatedMultiUtt;
+ std::vector m_featuresStartIndexMultiUtt;
+ std::vector m_labelsStartIndexMultiUtt;
+
+ std::vector m_featuresBufferMultiIO;
+ std::vector m_featuresBufferAllocatedMultiIO;
+ std::vector m_labelsBufferMultiIO;
+ std::vector m_labelsBufferAllocatedMultiIO;
+
+ std::map m_featureNameToIdMap;
+ std::map m_labelNameToIdMap;
+ std::map m_nameToTypeMap;
+ std::map m_featureNameToDimMap;
+ std::map m_labelNameToDimMap;
+ // for writing outputs to files (standard single input/output network) - deprecate eventually
+ bool m_checkDictionaryKeys;
+ bool m_convertLabelsToTargets; // set to false during initialisation in HTKMLFReader.cpp
+ std::vector m_convertLabelsToTargetsMultiIO;
+ std::vector> m_inputFilesMultiIO;
+
+ size_t m_inputFileIndex;
+ std::vector m_featDims; // passed to the frame source constructor
+ std::vector m_labelDims; // passed to the frame source constructor
+
+ std::vector>>m_labelToTargetMapMultiIO;
+
+ void PrepareForTrainingOrTesting(const ConfigParameters& config);
+ void PrepareForWriting(const ConfigParameters& config);
+
+ bool GetMinibatchToTrainOrTest(std::map*>&matrices);
+ bool GetMinibatchToWrite(std::map*>&matrices);
+
+ void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); // called when m_trainOrTest is true
+ void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+
+ bool ReNewBufferForMultiIO(size_t i);
+
+ size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;} // current utterances-per-minibatch setting
+ void SetNbrSlicesEachRecurrentIter(const size_t) { }; // intentionally empty: the argument is ignored
+
+ void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels);
+
+
+ size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector>& labelToTargetMap);
+ enum InputOutputTypes // NOTE(review): presumably the value type of m_nameToTypeMap - confirm
+ {
+ real,
+ category,
+ };
+
+
+
+public:
+ virtual void Init(const ConfigParameters& config);
+ virtual void Destroy() {delete this;} // the object deletes itself; it must not be used afterwards
+ virtual ~HTKMLFReader();
+ virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+ virtual bool GetMinibatch(std::map*>& matrices);
+ virtual const std::map& GetLabelMapping(const std::wstring& sectionName);
+ virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping);
+ virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
+
+ virtual bool DataEnd(EndDataType endDataType);
+ void SetSentenceEndInBatch(vector &/*sentenceEnd*/);
+ void SetSentenceEnd(int /*actualMbSize*/){}; // intentionally empty: the argument is ignored
+};
+
+>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d
}}}
\ No newline at end of file
diff --git a/DataReader/HTKMLFReader/rollingwindowsource.h b/DataReader/HTKMLFReader/rollingwindowsource.h
index a3babcb13..7d5e253cc 100644
--- a/DataReader/HTKMLFReader/rollingwindowsource.h
+++ b/DataReader/HTKMLFReader/rollingwindowsource.h
@@ -1,817 +1,817 @@
-//
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//
-// rollingwindowsource.h -- implementation of a rolling-window minibatch source ('minibatchframesource') with a disk page file
-//
-
-#pragma once
-
-#include "basetypes.h" // for attempt()
-#include "numahelpers.h" // for NUMA allocation
-#include "minibatchsourcehelpers.h"
-#include "minibatchiterator.h"
-#include "biggrowablevectors.h"
-#include "ssematrix.h"
-
-namespace msra { namespace dbn {
-
- // ---------------------------------------------------------------------------
- // biggrowablevectorarray -- a big array of vectors for features, growable (push_back)
- // Data is striped across NUMA nodes, as to not clog them up.
- // This also supports paging to disk, which is used for the old minibatchframesource.
- // ---------------------------------------------------------------------------
- class biggrowablevectorarray : public growablevectorbase
- {
- size_t m; // dim
-
- size_t inmembegin; // range we have in memory, rounded to enclosing blocks (not rounded at end)
- size_t inmemend;
-
- wstring pagepath; // path for paging, empty if no paging
- auto_file_ptr f; // file handle for paging
- bool reading; // have we begun reading?
-
- // allocate a block
- msra::dbn::matrix * newblock() const
- {
- // we stripe the data across NUMA nodes as to not fill up one node with the feature data
- msra::numa::overridenode ((int) msra::numa::getmostspaciousnumanode());
- msra::dbn::matrix * res = new msra::dbn::matrix (m, elementsperblock);
- msra::numa::overridenode (-1); // note: we really should reset it also in case of failure
- return res;
- }
-
- // handling of page file
- bool paging() const { return !pagepath.empty(); }
- void openpagefile (bool wantread)
- {
- if (!paging()) return;
- msra::files::make_intermediate_dirs (pagepath);
-
- if (!wantread)
- {
- FILE *ftry = NULL;
- wstring pathname (pagepath);
- ftry = _wfopen (pathname.c_str(), L"wbS");
- if (ftry) fclose (ftry);
- }
-
- /*
- code below to cycle through a-z appended to file name is no longer necessary
- since caller guarantees unique file names via HTKMLFReader
- and we want the pagepath logged to the user to be the actual one used by the code
-
- // try to open the pagepath from a to z
- if (!wantread)
- {
- FILE *ftry = NULL;
- char trynum = 'a';
- while (!ftry && trynum <= 'z')
- {
- wstring pathname (pagepath);
- pathname += trynum++;
- ftry = _wfopen (pathname.c_str(), L"wbS");
- }
- if (ftry) fclose (ftry);
- pagepath += --trynum;
- }
- */
- f = fopenOrDie (pagepath, wantread ? L"rbS" : L"wbS");
- reading = wantread;
- }
- void flushlastblock() // during population phase, must be called once per block in sequence
- {
- if (!paging()) return;
- assert (!reading);
- if (blocks.empty()) return;
- const size_t blockid = blocks.size() -1;
- msra::dbn::matrix & block = *blocks[blockid];
- assert (fgetpos (f) == blockid * block.sizeinpagefile());
- block.topagefile (f);
- blocks[blockid].reset(); // free the memory
- assert (blockid * elementsperblock == inmembegin);
- inmembegin = inmemend; // empty range
- }
- void releaseblock (size_t t0) // t0=block start time
- {
- assert (paging() && reading);
- size_t blockid = t0 / elementsperblock;
- assert (blockid * elementsperblock == t0);
- assert (blocks[blockid]);
- fprintf (stderr, "recoverblock: releasing feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1);
- blocks[blockid].reset(); // free the memory
- }
- void recoverblock (size_t t0) // t0=block start time
- {
- assert (paging() && reading);
- size_t blockid = t0 / elementsperblock;
- assert (blockid * elementsperblock == t0);
- assert (!blocks[blockid]);
- fprintf (stderr, "recoverblock: recovering feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1);
- blocks[blockid].reset (newblock());
- msra::dbn::matrix & block = *blocks[blockid];
- fsetpos (f, blockid * block.sizeinpagefile());
- block.frompagefile (f);
- }
-
- public:
- biggrowablevectorarray (const wstring & pagepath)
- : growablevectorbase (65536), m (0),
- inmembegin (0), inmemend (0), pagepath (pagepath), reading (false)
- {
- openpagefile (false);
- if (paging())
- fprintf (stderr, "biggrowablevectorarray: creating disk backup store at '%S'\n", pagepath.c_str());
- }
- ~biggrowablevectorarray() { // clean up the big temp file
- if (paging()) {
- fclose (f);
- if (_wunlink (pagepath.c_str())==0)
- fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str());
- else
- fprintf (stderr, "biggrowablevectorarray: unable to delete disk backup store at '%S'\n", pagepath.c_str());
- }
- }
-
- size_t dim() const { return m; } // dimension of a frame
-
- // reading phase
- void push_back (const std::vector & in)
- {
- assert (!in.empty());
- assert (m == 0 || m == in.size());
- m = in.size();
- const size_t blockid = n / elementsperblock;
- assert (blockid <= blocks.size());
- if (blockid == blocks.size()) // a new block is needed
- {
- flushlastblock();
- blocks.push_back (std::unique_ptr (newblock()));
- }
- const size_t blockn = n % elementsperblock;
- msra::dbn::matrix & block = *blocks[blockid].get();
- foreach_index (k, in)
- block(k,blockn) = in[k];
- n++;
- inmemend = n;
- }
- void no_more_push_back() // done pushing --switch to consumption mode
- {
- if (!paging()) return;
- // finish off last block
- flushlastblock();
- fflushOrDie (f);
- fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f));
- fclose (f);
- foreach_index (i, blocks) assert (!blocks[i]); // ensure we flushed
- assert (inmembegin == inmemend); // nothing in cache
- // switch to reading mode
- openpagefile (true);
- }
-
- // access phase
- // Returns 'true' if data was actually read from disk.
- bool require (pair bounds) // we require this range of frames
- {
- bool readfromdisk = false;
-
- // get bounds rounded to block boundaries
- const size_t ts = bounds.first / elementsperblock * elementsperblock;
- const size_t te = min (n, (bounds.second + elementsperblock -1) / elementsperblock * elementsperblock);
- assert (paging());
- // free all the memmory
- for (size_t t = inmembegin; t < inmemend; t += elementsperblock)
- {
- if (t >= ts && t < te) // if in wanted range then skip to end of it
- t = te - elementsperblock;
- else
- releaseblock (t);
- }
- // page in all required blocks
- for (size_t t = ts; t < te; t += elementsperblock)
- {
- if (t >= inmembegin && t < inmemend) // if in memory already then skip to end of it
- t = inmemend - elementsperblock;
- else
- {
- recoverblock (t);
- readfromdisk = true; // tell caller we did something expensive
- }
- }
- // got it
- inmembegin = ts;
- inmemend = te;
- return readfromdisk;
- }
- const msra::dbn::matrixstripe operator[] (size_t t) const // get a feature vector
- {
- if (t < inmembegin || t >= inmemend)
- throw std::logic_error ("biggrowablevectorarray: attempt to access vector without requesting to page it in first");
- const size_t blockt = getblockt (t);
- /*const*/ msra::dbn::matrix & block = getblock (t);
- return msra::dbn::matrixstripe (block, blockt, 1);
- }
- wstring pagepathname(){ return pagepath;}
- void cleanuppagefile()
- {
- if (paging()) {
- fclose (f);
- if (_wunlink (pagepath.c_str())==0){
- fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str());
- }
- else{
- fprintf (stderr, "biggrowablevectorarray: could NOT delete disk backup store at '%S'\n", pagepath.c_str());
- }
- }
- }
- };
-
- // ---------------------------------------------------------------------------
- // minibatchframesource -- feature source to provide randomized frames in minibatches
- // This is the old code that pages all frames to a huge disk file first.
- // (The new minibatchutterancesource pages from input files directly and can also
- // operate in utterance mode for MMI training.)
- // ---------------------------------------------------------------------------
- class minibatchframesource : public minibatchsource
- {
- size_t vdim; // feature dimension after augmenting neighhors (0: don't read features)
- unsigned int sampperiod; // (for reference and to check against model)
- string featkind;
- size_t featdim;
- // cache
- biggrowablevectorarray frames; // [t][i] all features concatenated
- std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors())
- std::vector classids; // [t] the state that the frame belongs to
- size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels
- msra::dbn::randomordering randomordering; // [t] -> t'
- double timegetbatch;
- int verbosity;
- public:
- // constructor
- // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
- minibatchframesource (const std::vector & infiles, const map> & labels,
- size_t vdim, size_t udim, size_t randomizationrange, const wstring & pagepath, const bool mayhavenoframe=false, int addEnergy=0)
- : vdim (vdim), sampperiod (0), featdim (0), numframes (0), frames (pagepath), timegetbatch (0), verbosity(2)
- {
- if (vdim == 0 && labels.empty())
- throw runtime_error ("minibatchframesource: when running without features, labels are needed");
- // at this stage, we simply page in the entire training set at once and work off RAM
- // We will benefit from feature archives indirectly through htkfeatio.
- // TODO:
- // - infiles must specify time range
- // - at this stage only reserve() (we know the time range; allocate second-layer structure)
- // - implement block-wise paging directly from HTK feature files through htkfeatreader
- featkind.clear();
- std::vector frame;
- fprintf (stderr, "minibatchframesource: reading %d utterances..", infiles.size());
- size_t numclasses = 0; // number of units found (actually max id +1)
- size_t notfound = 0; // number of entries missing in MLF
- msra::asr::htkfeatreader reader; // feature reader
- reader.AddEnergy(addEnergy);
-
- foreach_index (i, infiles)
- {
- if (i % (infiles.size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
- msra::basetypes::matrix feat;
- msra::asr::htkfeatreader::parsedpath ppath (infiles[i]);
-
- // skip files for which labels don't exist (assuming bad alignment)
- wstring key;
- if (!labels.empty()) // empty means unsupervised mode (don't load any)
- {
- key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none)
- if (labels.find (key) == labels.end())
- {
- if (notfound < 5)
- fprintf (stderr, "\nminibatchframesource: %d-th file not found in MLF label set: %S", i, key.c_str());
- notfound++;
- continue; // skip this utterance at all
- }
- }
-
- // get feature frames
- if (vdim != 0) // (vdim == special mode to not read features at all)
- {
- msra::util::attempt (5, [&]()
- {
- reader.read (ppath, featkind, sampperiod, feat); // whole file read as columns of feature vectors
- });
- if (featdim == 0) // first time
- featdim = feat.rows();
- else if (featdim != feat.rows())
- throw std::runtime_error ("minibatchframesource: inconsistent feature dimension across files");
- // HVite occasionally generates mismatching output --skip such files
- if (!key.empty()) // (we have a key if supervised mode)
- {
- const auto & labseq = labels.find (key)->second; // (we already checked above that it exists)
- size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
- if (abs ((int) labframes - (int) feat.cols()) > 0)
- {
- fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
- notfound++;
- continue; // skip this utterance at all
- }
- }
- // append to cache
- frame.resize (featdim);
- if (feat.cols() < 2) // (2 frames needed for boundary markers)
- throw std::runtime_error ("minibatchframesource: utterances < 2 frames not supported");
- foreach_column (t, feat)
- {
- foreach_index (k, frame)
- frame[k] = feat(k,t);
- frames.push_back (frame);
- numframes++;
- boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? +1 : 0);
- }
- assert (numframes == frames.size());
- assert (numframes == boundaryflags.size());
- }
-
- // get label sequence
- if (!key.empty()) // (we have a key if supervised mode)
- {
- const auto & labseq = labels.find (key)->second; // (we already checked above that it exists)
- foreach_index (i, labseq)
- {
- const auto & e = labseq[i];
- if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
- throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
- for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
- {
- if (e.classid >= udim)
- throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: class id exceeds model dimension in file %S", key.c_str()));
- if (e.classid != (CLASSIDTYPE) e.classid)
- throw std::runtime_error ("CLASSIDTYPE has too few bits");
- classids.push_back ((CLASSIDTYPE) e.classid);
- numclasses = max (numclasses, 1u + e.classid);
- }
- }
- if (vdim == 0)
- numframes = classids.size();
- if (numframes != classids.size()) // TODO: remove this once we are confident
- throw std::runtime_error (msra::strfun::strprintf ("minibatchframesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
- assert (numframes == classids.size());
- }
- else
- {
- assert (classids.empty()); // that's how we detect it later
- }
- }
- assert (vdim == 0 || numframes == frames.size());
- assert (labels.empty() || numframes == classids.size());
- if ((vdim != 0 && numframes != frames.size()) || (!labels.empty() && numframes != classids.size()))
- throw std::runtime_error ("minibatchframesource: numframes variable screwup");
- fprintf (stderr, " %d frames read from %d utterances; %d classes\n", numframes, infiles.size(), numclasses);
- if (notfound > 0)
- {
- fprintf (stderr, "minibatchframesource: %d files out of %d not found in label set\n", notfound, infiles.size());
- if (notfound > infiles.size() / 2)
- throw std::runtime_error ("minibatchframesource: too many files not found in label set--assuming broken configuration\n");
- }
-
- if (numframes == 0 && !mayhavenoframe)
- throw std::runtime_error ("minibatchframesource: no input features given!");
-
- // notify frames source to switch from population to consumption mode
- frames.no_more_push_back();
-
- // initialize randomizer
- if (numframes > 0)
- randomordering.resize (numframes, randomizationrange);
- }
- virtual ~minibatchframesource() {}
- size_t totalframes() const { assert (vdim == 0 || numframes == frames.size()); assert (!issupervised() || numframes == classids.size()); return numframes; }
-
- bool issupervised() const { return !classids.empty(); }
-
- void setverbosity(int newverbosity) { verbosity = newverbosity; }
-
- // retrieve one minibatch
- // Minibatches are deterministic pseudo-random samples. The entire corpus
- // is repeated infinitely, but each repetition (a 'sweep') is randomized
- // differently.
- // This function allows to retrieve a mini-batch starting from any frame
- // within this infinitely extended repetition. To the end, mini-batches are
- // specified by start frame and #frames.
- // This function returns the same data independent on #frames, i.e. the concept
- // of the mini-batch is not defined in here, but on the caller side. The caller
- // can retrieve the frames of a mini-batch in chunks that do not match the
- // caller's definition of "mini-batch," e.g. bigger or smaller chunks.
- // If a requested mini-batch spans a sweep boundary, then this function will
- // not return samples after the sweep boundary. Instead, the returned frame
- // set is shortened to not exceed the end of the sweep. The caller must make
- // a separate second call to get the rest. In trainlayer(), the one
- // sweep-boundary-spanning mini-batch will simply be shortened.
- // This function is NOT thread-safe (due to caching of random sequence).
- bool getbatch (const size_t globalts, const size_t framesrequested, msra::dbn::matrix & feat, std::vector & uids,
- std::vector> & transcripts,
- std::vector> & latticepairs)
- {
- auto_timer timergetbatch;
-
- transcripts.clear(); // word-level transcripts not supported by frame source (aimed at MMI)
- latticepairs.clear(); // neither are lattices
-
- assert (totalframes() > 0);
- const size_t sweep = globalts / totalframes(); // which sweep (this determines randomization)
- const size_t ts = globalts % totalframes(); // start frame within the sweep
- const size_t te = min (ts + framesrequested, totalframes()); // do not go beyond sweep boundary
- assert (te > ts);
- if (verbosity >= 2)
- fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep);
-
- // get random sequence (each time index occurs exactly once)
- // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes.
- const auto & tmap = randomordering (sweep);
-
- // page in the needed range of frames
- const size_t extent = augmentationextent (frames.dim(), vdim);
- bool readfromdisk = frames.require (randomordering.bounds (max (ts, extent) - extent, te + 1 + extent));
-
- // generate features and uids
- feat.resize (vdim, te - ts); // note: special mode vdim == 0 means no features to be loaded
- if (issupervised()) // empty means unsupervised training -> return empty uids
- uids.resize (te - ts);
- else
- uids.clear();
- for (size_t t = ts; t < te; t++)
- {
- size_t trand = tmap[t]; // the random-sequence sample point for this point in time
- if (vdim != 0)
- {
- auto v_t = feat.col(t-ts); // the vector to fill in
- augmentneighbors (frames, boundaryflags, trand, v_t);
- }
- if (issupervised())
- uids[t-ts] = classids[trand];
- }
- timegetbatch = timergetbatch;
- return readfromdisk;
- }
-
- bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids,
- std::vector> & transcripts,
- std::vector> & latticepairs)
- {
- // for single input/output set size to be 1 and run old getbatch
- feat.resize(1);
- uids.resize(1);
- //transcripts.resize(1);
- //latticepairs.resize(1);
- return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs);
- }
-
- double gettimegetbatch () { return timegetbatch;}
-
- // return first valid globalts to ask getbatch() for
- // In frame mode, there is no constraint, i.e. it is 'globalts' itself.
- /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; }
-
- /*implement*/ const std::vector & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); static std::vector x; return x;/*keep compiler happy*/ }
- };
-
- // ---------------------------------------------------------------------------
- // minibatchframesourcemulti -- feature source to provide randomized frames in minibatches
- // this is derived from minibatchframesource but worked with multiple inputs and/or outputs
- // by making "frames" and "classids" a vector of vectors
- // ---------------------------------------------------------------------------
- class minibatchframesourcemulti : public minibatchsource
- {
- std::vector vdim; // feature dimension after augmenting neighhors (0: don't read features)
- std::vector leftcontext; // number of frames to the left of the target frame in the context window
- std::vector rightcontext; // number of frames to the right of the target frame in the context window
- unsigned int sampperiod; // (for reference and to check against model)
- string featkind;
- size_t featdim;
- size_t maxvdim;
- // cache
- //std::vector frames;
- std::vector> pframes; // [t][i] all features concatenated
- std::vector boundaryflags; // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors())
- std::vector> classids; // [t] the state that the frame belongs to
- size_t numframes; // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels
- msra::dbn::randomordering randomordering; // [t] -> t'
- double timegetbatch;
- int verbosity;
-
- public:
- // constructor
- // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
- minibatchframesourcemulti (const std::vector> & infiles, const std::vector