CNTK/Source/Common/File.cpp

967 строки
32 KiB
C++
Исходник Обычный вид История

2014-08-30 03:21:42 +04:00
//
2016-01-18 11:36:17 +03:00
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
2014-08-30 03:21:42 +04:00
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "Basics.h"
2014-08-30 03:21:42 +04:00
#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
#include "File.h"
#include <string>
#include <stdint.h>
#include <locale>
#ifdef _WIN32
#define NOMINMAX
#include "Windows.h"
2016-03-23 17:32:19 +03:00
#include <VersionHelpers.h>
#include <Shlwapi.h>
#pragma comment(lib, "Shlwapi.lib")
#endif
#ifdef __unix__
#include <unistd.h>
2016-03-06 08:40:02 +03:00
#include <linux/limits.h> // for PATH_MAX
#endif
2014-08-30 03:21:42 +04:00
#define PCLOSE_ERROR -1
#define WRITE_BUFFER_SIZE (1024 * 1024)
namespace Microsoft { namespace MSR { namespace CNTK {
2014-08-30 03:21:42 +04:00
// File creation from a wide-character path
// filename    - the path; Init() also understands "-", "|cmd" and "cmd|"
// fileOptions - bitwise combination of fileOptions* flags
File::File(const std::wstring& filename, int fileOptions)
{
    const wchar_t* const path = filename.c_str();
    Init(path, fileOptions);
}
// File creation from a narrow-character path
// filename    - the path; it is converted with msra::strfun::utf16() before being passed to Init()
// fileOptions - bitwise combination of fileOptions* flags
File::File(const std::string& filename, int fileOptions)
{
    // keep the converted string alive in a named local while Init() reads its buffer
    const std::wstring widePath = msra::strfun::utf16(filename);
    Init(widePath.c_str(), fileOptions);
}
// File creation from a zero-terminated wide-character path
// filename    - the path, forwarded unchanged to Init()
// fileOptions - bitwise combination of fileOptions* flags
File::File(const wchar_t* filename, int fileOptions)
{
    this->Init(filename, fileOptions);
}
// IsNonFilePath - test whether a pathname uses one of the special non-file syntaxes
// filename - the pathname to test (std::string or std::wstring)
// returns  - true for "|command" (output pipe), "command|" (input pipe) and "-" (stdin/stdout);
//            false for everything else, including the empty string
template<class String>
static bool IsNonFilePath(const String& filename)
{
    // front()/back() are undefined on an empty string, so rule that case out first
    if (filename.empty())
        return false;
    return
        filename.front() == '|' ||                    // "| command": output pipe
        filename.back() == '|' ||                     // "command |": input pipe
        (filename.size() == 1 && filename[0] == '-'); // "-": stdin/stdout
}
// Exists - test whether the given pathname can be considered to exist
// Pipe and stdin/stdout pathnames (see IsNonFilePath()) always count as existing;
// any other pathname is checked with fexists().
template<class String>
/*static*/ bool File::Exists(const String& filename)
{
    if (IsNonFilePath(filename))
        return true;
    return fexists(filename);
}
template /*static*/ bool File::Exists<string> (const string& filename);
template /*static*/ bool File::Exists<wstring>(const wstring& filename);
// MakeIntermediateDirs - create the intermediate directories of a pathname
// Nothing is done for pipe and stdin/stdout pathnames (see IsNonFilePath()).
template<class String>
/*static*/ void File::MakeIntermediateDirs(const String& filename)
{
    if (IsNonFilePath(filename))
        return;
    msra::files::make_intermediate_dirs(filename);
}
//template /*static*/ void File::MakeIntermediateDirs<string> (const string& filename); // implement this if needed
template /*static*/ void File::MakeIntermediateDirs<wstring>(const wstring& filename);
// all constructors call this
2014-08-30 03:21:42 +04:00
void File::Init(const wchar_t* filename, int fileOptions)
{
m_filename = filename;
m_options = fileOptions;
if (m_filename.empty())
RuntimeError("File: filename is empty");
const auto outputPipe = (m_filename.front() == '|');
const auto inputPipe = (m_filename.back() == '|');
2014-08-30 03:21:42 +04:00
// translate the options string into a string for fopen()
const auto reading = !!(fileOptions & fileOptionsRead);
const auto writing = !!(fileOptions & fileOptionsWrite);
if (!reading && !writing)
RuntimeError("File: either fileOptionsRead or fileOptionsWrite must be specified");
// convert fileOptions to fopen()'s mode string
wstring options = reading ? L"r" : L"";
if (writing)
2014-08-30 03:21:42 +04:00
{
// if we already are reading the file, change to read/write
options.clear();
options.append(L"w");
if (!outputPipe && m_filename != L"-")
{
options.append(L"+");
msra::files::make_intermediate_dirs(m_filename.c_str()); // writing to regular file -> also create the intermediate directories as a convenience
}
2014-08-30 03:21:42 +04:00
}
if (fileOptions & fileOptionsBinary)
2014-08-30 03:21:42 +04:00
options += L"b";
else
options += L"t";
2014-08-30 03:21:42 +04:00
// add sequential flag to allocate big read buffer
if (fileOptions & fileOptionsSequential)
options += L"S";
// now open the file
// Special path syntax understood here:
// - "-" refers to stdin or stdout
// - "|cmd" writes to a pipe
// - "cmd|" reads from a pipe
m_pcloseNeeded = false;
m_seekable = false;
if (m_filename == L"-") // stdin/stdout
{
if (writing && reading)
2015-07-03 04:53:44 +03:00
RuntimeError("File: cannot specify fileOptionsRead and fileOptionsWrite at once with path '-'");
m_file = writing ? stdout : stdin;
}
else if (outputPipe || inputPipe) // pipe syntax
{
if (inputPipe && outputPipe)
RuntimeError("File: pipes cannot specify fileOptionsRead and fileOptionsWrite at once");
if (inputPipe != reading)
RuntimeError("File: pipes must use consistent fileOptionsRead/fileOptionsWrite");
const auto command = inputPipe ? m_filename.substr(0, m_filename.size() - 1) : m_filename.substr(1);
m_file = _wpopen(command.c_str(), options.c_str());
if (!m_file)
RuntimeError("File: error exexuting pipe command '%S': %s", command.c_str(), strerror(errno));
m_pcloseNeeded = true;
}
else
attempt([=]() // regular file: use a retry loop
{
m_file = fopenOrDie(filename, options.c_str());
m_seekable = true;
});
2014-08-30 03:21:42 +04:00
}
// determine the directory for a given pathname
// (wstring only for now; feel free to make this a template if needed)
// path    - the pathname (taken by value; it is modified in place)
// returns - the pathname with its last element removed, or "." if there is no directory part
/*static*/ wstring File::DirectoryPathOf(wstring path)
{
#ifdef _WIN32
    // Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
    // TODO:
    //  "PathCchCanonicalize does the / to \ conversion as a part of the canonicalization, it's
    //   probably a good idea to do that anyway since I suspect that the '..' characters might
    //   confuse the other PathCch functions" [Larry Osterman]
    //  "Consider GetFullPathName both for canonicalization and last element finding." [Jay Krell]
    path = msra::strfun::ReplaceAll<wstring>(path, L"/", L"\\");

    HRESULT hr;
    if (IsWindows8OrGreater()) // PathCchRemoveFileSpec() only available on Windows 8+
    {
        // the function is exported with the WINAPI calling convention; the pointer type must match it
        typedef HRESULT(WINAPI *PathCchRemoveFileSpecProc)(_Inout_updates_(_Inexpressible_(cchPath)) PWSTR, _In_ size_t);

        HINSTANCE hinstLib = LoadLibrary(TEXT("api-ms-win-core-path-l1-1-0.dll"));
        if (hinstLib == nullptr)
            RuntimeError("DirectoryPathOf: LoadLibrary() unexpectedly failed.");
        PathCchRemoveFileSpecProc PathCchRemoveFileSpec = reinterpret_cast<PathCchRemoveFileSpecProc>(GetProcAddress(hinstLib, "PathCchRemoveFileSpec"));
        if (!PathCchRemoveFileSpec)
        {
            FreeLibrary(hinstLib); // do not leak the module handle on the error path
            RuntimeError("DirectoryPathOf: GetProcAddress() unexpectedly failed.");
        }

        // this is the actual function call we care about
        hr = PathCchRemoveFileSpec(&path[0], path.size());

        FreeLibrary(hinstLib);
    }
    else // on Windows 7-, use older PathRemoveFileSpec() instead
        hr = PathRemoveFileSpec(&path[0]) ? S_OK : S_FALSE;

    if (hr == S_OK) // done
        path.resize(wcslen(&path[0])); // the API shortened the string in place; sync the length
    else if (hr == S_FALSE) // nothing to remove: use .
        path = L".";
    else
        RuntimeError("DirectoryPathOf: Path(Cch)RemoveFileSpec() unexpectedly failed with 0x%08x.", (unsigned int)hr);
#else
    auto pos = path.find_last_of(L"/");
    if (pos != path.npos)
        path.erase(pos);
    else // if no directory path at all, use current directory
        return L".";
#endif
    return path;
}
// determine the file name for a given pathname
// (wstring only for now; feel free to make this a template if needed)
// path    - the pathname
// returns - everything after the last path delimiter, or the whole path if it contains none
/*static*/ wstring File::FileNameOf(wstring path)
{
    // use _WIN32 (compiler-defined) like the rest of this file, rather than WIN32
#ifdef _WIN32
    static const wstring delim = L"\\:/";
#else
    static const wstring delim = L"/";
#endif
    auto pos = path.find_last_of(delim);
    if (pos != path.npos)
        return path.substr(pos + 1);
    else // no directory path
        return path;
}
// get path of current executable
// returns - the path as a wstring; on Windows from GetModuleFileNameW(), otherwise by
//           resolving the /proc/<pid>/exe symbolic link
/*static*/ wstring File::GetExecutablePath()
{
    // use _WIN32 (compiler-defined) like the rest of this file, rather than WIN32
#ifdef _WIN32
    wchar_t path[33000];
    if (GetModuleFileNameW(NULL, path, _countof(path)) == 0)
        LogicError("GetExecutablePath: GetModuleFileNameW() unexpectedly failed.");
    return path;
#else
    // from http://stackoverflow.com/questions/4025370/can-an-executable-discover-its-own-path-linux
    pid_t pid = getpid();
    char path[PATH_MAX + 1] = { 0 };
    sprintf(path, "/proc/%d/exe", (int) pid);
    // readlink() does not zero-terminate; dest is zero-initialized and one element larger than the read size
    char dest[PATH_MAX + 1] = { 0 };
    if (readlink(path, dest, PATH_MAX) == -1)
        RuntimeError("GetExecutablePath: readlink() call failed.");
    else
        return msra::strfun::utf16(dest);
#endif
}
// skip to given delimiter character
void File::SkipToDelimiter(int delim)
2014-08-30 03:21:42 +04:00
{
int ch = 0;
2014-08-30 03:21:42 +04:00
while (ch != delim)
{
ch = fgetc(m_file);
if (feof(m_file))
{
2014-08-30 03:21:42 +04:00
printf("Unexpected end of file\n");
LogicError("Unexpected end of file\n");
2014-08-30 03:21:42 +04:00
}
}
}
// IsTextBased - was the file opened with fileOptionsText?
bool File::IsTextBased()
{
    return (m_options & fileOptionsText) != 0;
}
// File Destructor
// closes the file
2016-03-05 00:50:59 +03:00
// Note: this does not check for errors when the File corresponds to pipe stream. In this case, use Flush() before closing a file you are writing.
2014-08-30 03:21:42 +04:00
File::~File(void)
{
int rc = 0;
if (m_pcloseNeeded)
{
rc = _pclose(m_file);
if ((rc == PCLOSE_ERROR) && !std::uncaught_exception())
{
RuntimeError("File: failed to close file at %S", m_filename.c_str());
}
}
else if (m_file != stdin && m_file != stdout && m_file != stderr)
{
rc = fclose(m_file);
if ((rc != FCLOSE_SUCCESS) && !std::uncaught_exception())
{
RuntimeError("File: failed to close file at %S", m_filename.c_str());
}
}
}
// Flush - flush the stream through fflushOrDie()
void File::Flush()
{
    FILE* const stream = m_file;
    fflushOrDie(stream);
}
// read a line
// End of line is denoted by one of these, i.e. we don't support the old Mac OS convention of CR
//  - LF
//  - CR+LF
//  - EOF

// helper: read one character
// c       - [out] the character read (the truncated EOF value if nothing could be read)
// returns - false if getc() returned EOF, true otherwise
static bool fgetc(char& c, FILE* f)
{
    const int next = getc(f);
    c = (char) next;
    return next != EOF;
}
// BeginsWithUnicodeBOM - do the first three bytes of s equal EF BB BF?
// Bytes are examined left to right and the comparison stops at the first mismatch.
static inline bool BeginsWithUnicodeBOM(const char* s)
{
    static const unsigned char bom[] = { 0xEF, 0xBB, 0xBF };
    for (size_t i = 0; i < sizeof(bom); i++)
    {
        if ((unsigned char) s[i] != bom[i])
            return false;
    }
    return true;
}
// read a 8-bit string until newline is hit
// s - [out] receives the line without its line terminator
// f - stream to read from
// The line ends at LF, CR+LF or EOF; a CR that is not followed by LF is reported through RuntimeError().
template<class STRING>
static void fgets(STRING& s, FILE* f)
{
    s.resize(0);
    for (;;)
    {
        char c;
        if (!fgetc(c, f))
            return; // EOF terminates the line
        if (c == '\r')
        {
            if (!fgetc(c, f) || c != '\n')
                RuntimeError("fgets: malformed text file, CR without LF");
            return;
        }
        if (c == '\n')
            return;
        s.push_back(c);
        // strip Unicode BOM
        // The check fires whenever the line collected so far is exactly the 3 BOM bytes,
        // i.e. for every line, not just the first one of the file.
        // This allows to UNIX-'cat' multiple UTF-8 files with BOMs.
        if (s.size() == 3 && BeginsWithUnicodeBOM(s.c_str()))
            s.clear();
    }
}
// GetLine - get a line from the file
// str - string
2014-08-30 03:21:42 +04:00
void File::GetLine(string& str)
{
fgets(str, m_file);
2014-08-30 03:21:42 +04:00
}
// PushBackString - append a line to a vector of lines
// The wstring overload converts the line with msra::strfun::utf16() first.
// Both overloads take the line by const reference since neither modifies it.
static void PushBackString(vector<string>& lines, const string& s) { lines.push_back(s); }
static void PushBackString(vector<wstring>& lines, const string& s) { lines.push_back(msra::strfun::utf16(s)); }
2015-09-14 23:18:44 +03:00
// GetLines - get all lines from a file
// file  - the file to read from
// lines - [out] cleared, then filled with one entry per GetLine() call made until IsEOF() reports true
template <typename STRING>
static void FileGetLines(File& file, /*out*/ std::vector<STRING>& lines)
{
    lines.clear();
    for (string currentLine; !file.IsEOF();)
    {
        file.GetLine(currentLine);
        PushBackString(lines, currentLine);
    }
}
void File::GetLines(std::vector<std::wstring>& lines)
{
FileGetLines(*this, lines);
};
void File::GetLines(std::vector<std::string>& lines)
{
FileGetLines(*this, lines);
}
2015-09-14 23:18:44 +03:00
2014-08-30 03:21:42 +04:00
// Put a zero/space terminated wstring into a file
// val - value to write to the file
File& File::operator<<(const std::wstring& val)
{
    const wchar_t* const text = val.c_str();
    WriteString(text);
    return *this;
}
// Put a zero/space terminated string into a file
// val - value to write to the file
File& File::operator<<(const std::string& val)
{
    const char* const text = val.c_str();
    WriteString(text);
    return *this;
}
// Put a marker in the file, the marker depends on the file type
// marker - marker to place in the file
// Only text files get anything written: ^Z for end of file and "\r\n" for end of list.
File& File::operator<<(FileMarker marker)
{
    const bool textFile = (m_options & fileOptionsText) != 0;
    switch (marker)
    {
    case fileMarkerEndFile: // end of file marker
        // use ^Z for end of file for text files
        // TODO: What??
        if (textFile)
            *this << char(26);
        break;
    case fileMarkerEndList: // end of line/list marker
        if (textFile)
            WriteString("\r\n");
        break;
    case fileMarkerBeginSection: // beginning of section
    case fileMarkerEndSection:   // end of section
        assert(false);           // sections should use a string modifier
        break;
    case fileMarkerBeginFile: // beginning of file marker
        // TODO: why not write a BOM?
    case fileMarkerBeginList: // beginning of list marker: no marker written
    case fileMarkerListSeparator: // separate elements of a list
        // do nothing for now, built in space deliminter for all types (before type)
        // future: make this customizable, so you can specify a separator (i.e. ',')
        break;
    }
    return *this;
}
// PutMarker for beginning of list support (lists with a count)
// marker - must be fileMarkerBeginList (asserted)
// count  - [in] the number of elements in the list; this is all that gets written
File& File::PutMarker(FileMarker marker, size_t count)
{
    assert(marker == fileMarkerBeginList); // only beginning of list supported for count markers
    (void) marker;                         // keeps the parameter referenced when assert() compiles away
    File& self = *this;
    self << count;
    return self;
}
// PutMarker for section beginning and ending tags
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (asserted)
// section - [in] name of section; this is all that gets written
File& File::PutMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // keeps the parameter referenced when assert() compiles away
    *this << section;
    return *this;
}
// PutMarker for section beginning and ending tags (wide-character section name)
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (asserted)
// section - [in] name of section; this is all that gets written
File& File::PutMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // keeps the parameter referenced when assert() compiles away
    *this << section;
    return *this;
}
// Get a wstring from a file
// val - [out] value read from the file: via fgetwtoken() for text files,
//       via fgetwstring() otherwise
File& File::operator>>(std::wstring& val)
{
    if (!IsTextBased())
    {
        val = fgetwstring(m_file);
        return *this;
    }
    val = fgetwtoken(m_file);
    return *this;
}
// Get a string from a file
// val - [out] value read from the file: via fgettoken() for text files,
//       via fgetstring() otherwise
File& File::operator>>(std::string& val)
{
    if (!IsTextBased())
    {
        val = fgetstring(m_file);
        return *this;
    }
    val = fgettoken(m_file);
    return *this;
}
// ReadChars - read a specified number of characters, and reset read pointer if requested
// val - [in,out] return value will be returned here
// cnt - number of characters to read
// reset - reset the read pointer
void File::ReadChars(std::string& val, size_t cnt, bool reset)
{
size_t pos = 0; // (initialize to keep compiler happy)
2014-08-30 03:21:42 +04:00
if (reset)
pos = GetPosition();
val.resize(cnt);
char* str = const_cast<char*>(val.c_str());
for (int i = 0; i < cnt; ++i)
2014-08-30 03:21:42 +04:00
*this >> str[i];
if (reset)
SetPosition(pos);
}
// ReadChars - read a specified number of characters, and reset read pointer if requested
// val - [in,out] return value will be returned here
// cnt - number of characters to read
// reset - reset the read pointer
void File::ReadChars(std::wstring& val, size_t cnt, bool reset)
{
size_t pos = 0; // (initialize to keep compiler happy)
2014-08-30 03:21:42 +04:00
if (reset)
pos = GetPosition();
val.resize(cnt);
wchar_t* str = const_cast<wchar_t*>(val.c_str());
for (int i = 0; i < cnt; ++i)
2014-08-30 03:21:42 +04:00
*this >> str[i];
if (reset)
SetPosition(pos);
}
// WriteString - outputs a string into the file
// str  - the string to output
// size - size of the string to output, if zero null terminated
// With a positive size, or for text files, the string is written preceded by a space;
// otherwise it goes through fputstring().
void File::WriteString(const char* str, int size)
{
    if (size > 0)
    {
        fwprintf(m_file, L" %.*hs", size, str);
        return;
    }
    if (!IsTextBased())
    {
        fputstring(m_file, str);
        return;
    }
    fwprintf(m_file, L" %hs", str);
}
// ReadString - reads a string into the file
// str - the string buffer to read the string into
// size - size of the string buffer incl. zero terminator (we fail if input is too long)
2014-08-30 03:21:42 +04:00
void File::ReadString(char* str, int size)
{
if (IsTextBased())
{
fgettoken(m_file, str, size);
if (BeginsWithUnicodeBOM(str))
for (; str[3]; str++)
str[0] = str[3]; // delete it from start of line
}
else
fgetstring(m_file, str, size);
2014-08-30 03:21:42 +04:00
}
// WriteString - outputs a string into the file
// if writing to text based file and spaces are embedded, writes quotes around string
// BUGBUG: This should be consistent between char and wchar_t versions
2014-08-30 03:21:42 +04:00
// str - the string to output
// size - size of the string to output, if zero null terminated
void File::WriteString(const wchar_t* str, int size)
{
#ifdef EMBEDDED_SPACES
// start of implementation of embedded space support with quoting
// not complete, not sure if we need it
bool spacefound = false;
wchar_t quote = 0;
if (IsTextBased())
{
// search for embedded spaces and quotes
wstring searchString = L" \"'~";
const wchar_t* result = NULL;
while (result = wcspbrk(str, searchString.c_str()))
{
if (IsWhiteSpace(*result))
spacefound = true;
searchString.find(*result, 0);
}
}
2014-08-30 03:21:42 +04:00
#endif
if (size > 0)
{
fwprintf(m_file, L" %.*ls", size, str);
}
else
{
if (IsTextBased())
fwprintf(m_file, L" %ls", str);
else
fputstring(m_file, str);
}
2014-08-30 03:21:42 +04:00
}
// ReadString - reads a string from the file
2014-08-30 03:21:42 +04:00
// str - the string buffer to read the string into
// size - size of the string string buffer
void File::ReadString(wchar_t* str, int size)
{
if (IsTextBased())
fgettoken(m_file, str, size);
else
fgetstring(m_file, str, size);
2014-08-30 03:21:42 +04:00
}
// IsUnicodeBOM - is the next characters the Unicode Byte Order Mark?
// skip - skip the BOM mark if found (defaults to false)
// returns - true if on a unicode BOM
bool File::IsUnicodeBOM(bool skip)
{
File& file = *this;
uint64_t pos = GetPosition(); // Note: This is where we will fail for non-seekable streams.
2014-08-30 03:21:42 +04:00
// if we aren't at the beginning of the file, it can't be the byte order mark
if (pos != 0)
return false;
// only exists for UNICODE files
bool found = false;
if (m_options & fileOptionsText)
2014-08-30 03:21:42 +04:00
{
char val[3] = { 0 };
for (size_t i = 0; i < _countof(val) && !file.IsEOF(); i++)
val[i] = (char) getc(m_file);
found = BeginsWithUnicodeBOM(val);
2014-08-30 03:21:42 +04:00
}
// restore pointer if no BOM or we aren't skipping it
if (!found || !skip)
{
SetPosition(pos);
}
return found;
}
//Size - return the size of the file
// WARNING: calling this will reset the EOF marker, so do so with care
size_t File::Size()
{
if (!CanSeek())
RuntimeError("File: attempted to get Size() on non-seekable stream");
2014-08-30 03:21:42 +04:00
return filesize(m_file);
}
// IsEOF - if we have read past the end of the file
// return - true if the stream's end-of-file indicator (feof) is set
bool File::IsEOF()
{
    return feof(m_file) != 0;
}
// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
// skip - skip the whitespace if found (defaults to false)
// returns - true if whitespace found
// TODO: This function actually consumes the white-space characters. Document that behavior.
2014-08-30 03:21:42 +04:00
bool File::IsWhiteSpace(bool skip)
{
bool spaceFound = false;
bool spaceCur = false;
int c;
do
2014-08-30 03:21:42 +04:00
{
c = fgetc(m_file);
if (c == EOF) // hit the end
return spaceFound;
spaceCur = !!isspace(c);
spaceFound = spaceFound || spaceCur;
} while (spaceCur && skip);
// put back the last character (EOF is ignored)
ungetc(c, m_file);
2014-08-30 03:21:42 +04:00
return spaceFound;
}
// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceeded by (space, \t)? EOF detected too
// skip    - skip the end of line if found (defaults to false)
// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
// For non-text files this always returns false; text files delegate to fskipNewline().
int File::EndOfLineOrEOF(bool skip)
{
    if (!IsTextBased())
        return false;
    return fskipNewline(m_file, skip);
}
// Buffer write stream
int File::Setvbuf()
{
return setvbuf(this->m_file, NULL, _IOFBF, WRITE_BUFFER_SIZE);
}
2014-08-30 03:21:42 +04:00
// Get a marker from the file
// some are ignored others are expecting characters
// must use GetMarker methods for those that require parameters
// marker - the marker to consume; a missing end-of-file or end-of-list marker is reported through RuntimeError()
File& File::operator>>(FileMarker marker)
{
    switch (marker)
    {
    case fileMarkerBeginFile: // beginning of file marker
        // skip a Unicode BOM if present
        if (IsTextBased() && CanSeek()) // files from a pipe cannot begin with Unicode BOM, sorry
            IsUnicodeBOM(true);
        break;
    case fileMarkerEndFile: // end of file marker
        if (!IsEOF())
            RuntimeError("fileMarkerEndFile not found");
        break;
    case fileMarkerEndList: // end of line/list marker
        // EOF can also be returned by EndOfLineOrEOF(), hence the comparison against true
        if (IsTextBased() && EndOfLineOrEOF(true) != (int) true)
            RuntimeError("Newline not found");
        break;
    case fileMarkerBeginSection: // beginning of section
    case fileMarkerEndSection:   // end of section
        assert(false);           // sections should use a string modifier
        break;
    case fileMarkerBeginList: // beginning of list marker
        // no marker written unless an list with a count header
    case fileMarkerListSeparator: // separate elements of a list
        // do nothing for now, built in space deliminter for all types (before type)
        // future: make this customizable, so you can specify a separator (i.e. ',')
        break;
    }
    return *this;
}
// Get a marker from the file
// some are ignored others are expecting characters
// must use GetMarker methods for those that require parameters
// This function will fail for non-seekable streams.
2014-08-30 03:21:42 +04:00
bool File::IsMarker(FileMarker marker, bool skip)
{
bool retval = false;
switch (marker)
2014-08-30 03:21:42 +04:00
{
case fileMarkerBeginFile: // beginning of file marker
// check for Unicode BOM marker
retval = IsUnicodeBOM(skip);
break;
case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
retval = IsEOF();
break;
case fileMarkerBeginList: // Beginning of list marker
// no marker written unless an list with a count header
// should we try to validate BOL header (just know it's an int, not negative, etc.)
break;
case fileMarkerListSeparator: // separate elements of a list
// do nothing for now, built in space deliminter for all types (before type)
// future: make this customizable, so you can specify a separator (i.e. ',')
break;
case fileMarkerEndList: // end of line/list marker
if (IsTextBased())
{
int eolSeen = false;
eolSeen = EndOfLineOrEOF(skip);
retval = (eolSeen == (int) true);
2014-08-30 03:21:42 +04:00
}
break;
case fileMarkerBeginSection: // beginning of section
case fileMarkerEndSection: // end of section
2014-08-30 03:21:42 +04:00
// can't destinquish from a string currently
break;
}
return retval;
}
// GetMarker for beginning of list support (lists with a count)
// marker - must be fileMarkerBeginList (asserted)
// count  - [out] returns the number of elements in the list
File& File::GetMarker(FileMarker marker, size_t& count)
{
    assert(marker == fileMarkerBeginList); // only beginning of list supported for count file markers
    (void) marker;                         // keeps the parameter referenced when assert() compiles away
    if (!IsTextBased())
    {
        fget(m_file, count);
        return *this;
    }
    // use text based try, so it can fail without an exception
    ftrygetText(m_file, count);
    return *this;
}
// GetMarker for section beginning and ending tags
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (asserted)
// section - [in] name of section that is expected; a different name is reported through RuntimeError()
File& File::GetMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // keeps the parameter referenced when assert() compiles away

    string nameRead;
    *this >> nameRead;
    const bool matches = (nameRead == section);
    if (!matches)
        RuntimeError("section name mismatch %s != %s", nameRead.c_str(), section.c_str());
    return *this;
}
// GetMarker for section beginning and ending tags (wide-character section name)
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (asserted)
// section - [in] name of section that is expected; a different name is reported through RuntimeError()
File& File::GetMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // keeps the parameter referenced when assert() compiles away

    wstring nameRead;
    *this >> nameRead;
    const bool matches = (nameRead == section);
    if (!matches)
        RuntimeError("section name mismatch %ls != %ls", nameRead.c_str(), section.c_str());
    return *this;
}
// TryGetMarker for section beginning and ending tags
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (asserted)
// section - [in] name of section that is expected
// returns - true if the next string in the file equals section; otherwise false,
//           with the read position restored to where it was before the call
bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // keeps the parameter referenced when assert() compiles away

    const size_t startPos = GetPosition();
    bool matched = false;
    try
    {
        std::wstring nameRead;
        *this >> nameRead;
        matched = (nameRead == section);
    }
    catch (...)
    {
        // eat: a failed read counts as "marker not found"
    }
    if (matched)
        return true;
    SetPosition(startPos);
    return false;
}
// TryGetMarker for section beginning and ending tags
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (asserted)
// section - [in] name of section that is expected
// returns - true if the next string in the file equals section; otherwise false,
//           with the read position restored to where it was before the call
bool File::TryGetMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // keeps the parameter referenced when assert() compiles away

    const uint64_t pos = GetPosition();
    std::string str;
    try
    {
        *this >> str;
        if (str == section)
            return true;
    }
    catch (...)
    {
        // eat: a failed read counts as "marker not found". Fall through so the
        // read position is restored, as in the wstring overload.
    }
    SetPosition(pos);
    return false;
}
// GetPosition - Get position in a file
uint64_t File::GetPosition()
2014-08-30 03:21:42 +04:00
{
if (!CanSeek())
RuntimeError("File: attempted to GetPosition() on non-seekable stream");
2014-08-30 03:21:42 +04:00
return fgetpos(m_file);
}
// Set the position in the file
// pos - position in the file
void File::SetPosition(uint64_t pos)
2014-08-30 03:21:42 +04:00
{
if (!CanSeek())
RuntimeError("File: attempted to SetPosition() on non-seekable stream");
fsetpos(m_file, pos);
2014-08-30 03:21:42 +04:00
}
// helper to load a matrix from a stream (file or string literal)
// The input string is expected to contain one line per matrix row (natural printing order for humans).
// Inputs:
//  - getLineFn: a lambda that fills a string with the next input line (=next matrix row)
//    The lambda returns an empty string to denote the end.
//  - locationForMsg: description of the input source, used in error messages only
// Outputs:
//  - numRows, numCols: matrix dimensions inferred from newlines
//  - array: matrix values in column-major order (ready for SetValue())
// Malformed numbers and rows of inconsistent length are reported through RuntimeError().
template<class ElemType, class F>
static void LoadMatrixFromLambda(const F& getLineFn, const wstring& locationForMsg, vector<ElemType>& array, size_t& /*out*/ numRows, size_t& /*out*/ numCols)
{
    // load matrix into vector of vectors (since we don't know the size in advance)
    vector<ElemType> vec;
    std::vector<std::vector<ElemType>> elements;
    size_t numColsInFirstRow = 0;
    std::string line;
    for (;;)
    {
        // get next input line
        getLineFn(line);
        if (line.empty())
            break;

        // tokenize and parse
        vec.clear();
        const char* p = line.c_str();
        for (;;)
        {
            while (isspace((unsigned char)*p))
                p++;
            if (!*p)
                break;
            char* ep; // will be set to point to first character that failed parsing
            double value = strtod(p, &ep);
            // a number must be followed by white space or the end of the line
            if (*ep != 0 && !isspace((unsigned char)*ep))
                RuntimeError("LoadMatrixFromTextFile: Malformed number '%.15s...' in row %d of %ls", p, (int)elements.size(), locationForMsg.c_str());
            p = ep;
            vec.push_back((ElemType)value);
        }

        // the first row defines the column dimension; all others must agree with it
        size_t numElementsInRow = vec.size();
        if (elements.empty())
            numColsInFirstRow = numElementsInRow;
        else if (numElementsInRow != numColsInFirstRow)
            RuntimeError("Row %d has column dimension %d, inconsistent with previous dimension %d: %ls", (int)elements.size(), (int)numElementsInRow, (int)numColsInFirstRow, locationForMsg.c_str());

        elements.push_back(vec);
    }

    numRows = elements.size();
    numCols = numColsInFirstRow;

    // Perform transpose when copying elements from vectors to ElemType[],
    // in order to store in column-major format.
    // (size_t indices: the bounds are size_t, so int indices would mix signedness)
    array.resize(numRows * numCols);
    for (size_t i = 0; i < numCols; i++)
        for (size_t j = 0; j < numRows; j++)
            array[i * numRows + j] = elements[j][i];
}
// Load matrix from file. The file is a simple text file consisting of one line per matrix row, where each line contains the elements of the row separated by white space.
// filePath         - the file to read; opened with fileOptionsText | fileOptionsRead
// numRows, numCols - [out] dimensions of the matrix
// returns          - the matrix elements as produced by LoadMatrixFromLambda() (column-major)
template <class ElemType>
/*static*/ vector<ElemType> File::LoadMatrixFromTextFile(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols)
{
    File myfile(filePath, FileOptions::fileOptionsText | FileOptions::fileOptionsRead);

    // LoadMatrixFromLambda() pulls its input lines from this lambda, which
    // delivers the next non-empty line, or an empty string once the end is reached
    auto getLineFn = [&](string& line)
    {
        for (;;)
        {
            if (myfile.IsEOF())
            {
                line.clear(); // empty line indicates end of file
                return;
            }
            myfile.GetLine(line);
            // End of file manifests as an empty line at the end.
            // Also, we allow empty lines within the file, as that may help to visually structure matrices that really are >2D tensors.
            if (!line.empty())
                return; // got the next line to return
        }
    };

    vector<ElemType> array;
    LoadMatrixFromLambda(getLineFn, filePath, array, numRows, numCols);
    return array;
}
// Load matrix from a string literal. The literal consists of one line per matrix row, where each line contains the elements of the row separated by white space.
// literal          - the text to parse; rows are separated by CR and/or LF
// numRows, numCols - [out] dimensions of the matrix
// returns          - the matrix elements as produced by LoadMatrixFromLambda() (column-major)
template <class ElemType>
/*static*/ vector<ElemType> File::LoadMatrixFromStringLiteral(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols)
{
    // LoadMatrixFromLambda() pulls its input lines from the lambda below, which
    // delivers the next line, or an empty string once the end is reached
    size_t pos = 0; // cursor for traversing the string. The lambda takes this by reference and modifies it.
    auto getLineFn = [&](string& line)
    {
        // skip previous line end and any leading spaces to find the first character of the line
        pos = literal.find_first_not_of(" \r\n", pos);
        if (pos == string::npos)
        {
            line.clear(); // hit the end: return empty line
            return;
        }
        // find line end
        auto endPos = literal.find_first_of("\r\n", pos + 1);
        if (endPos == string::npos)
            endPos = literal.size(); // no LF required at very end, so that it looks pretty in BS source code
        line.assign(literal, pos, endPos - pos);
        pos = endPos; // and advance cursor (we position it on the LF, which is skipped in next round)
    };

    vector<ElemType> array;
    LoadMatrixFromLambda(getLineFn, L"string literal", array, numRows, numCols);
    return array;
}
// Explicit instantiations of the two matrix loaders for the element types float and double.
template vector<float> File::LoadMatrixFromTextFile<float> (const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<double> File::LoadMatrixFromTextFile<double>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<float> File::LoadMatrixFromStringLiteral<float> (const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<double> File::LoadMatrixFromStringLiteral<double>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
}}}