// CNTK/Common/File.cpp
//
// 710 lines
// 22 KiB
// C++

//
// <copyright file="File.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "Basics.h"
#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
#include "fileutil.h"
#include "File.h"
#include <string>
#include <stdint.h>
#include <locale>
#ifdef _WIN32
#include <Windows.h>
#endif
#ifdef __unix__
#include <unistd.h>
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
// File creation from a wide-character path
// filename - the path; forwarded unchanged to Init(), which rejects an empty path and
//            gives special meaning to "-", "|cmd" and "cmd|"
// fileOptions - options to open the file (combination of fileOptions* flags)
File::File(const std::wstring& filename, int fileOptions)
{
Init(filename.c_str(), fileOptions);
}
// File creation from a narrow-character path
// filename - the path; widened with msra::strfun::utf16() because Init() takes a wchar_t*
// fileOptions - options to open the file (combination of fileOptions* flags)
File::File(const std::string& filename, int fileOptions)
{
    // keep the widened copy alive in a named local for the duration of the Init() call
    const auto widePath = msra::strfun::utf16(filename);
    Init(widePath.c_str(), fileOptions);
}
// File creation from a zero-terminated wide-character path
// filename - the path; handed straight to Init()
// fileOptions - options to open the file (combination of fileOptions* flags)
File::File(const wchar_t* filename, int fileOptions)
{
Init(filename, fileOptions);
}
// all constructors call this
void File::Init(const wchar_t* filename, int fileOptions)
{
m_filename = filename;
m_options = fileOptions;
if (m_filename.empty())
RuntimeError("File: filename is empty");
const auto outputPipe = (m_filename.front() == '|');
const auto inputPipe = (m_filename.back() == '|');
// translate the options string into a string for fopen()
const auto reading = !!(fileOptions & fileOptionsRead);
const auto writing = !!(fileOptions & fileOptionsWrite);
if (!reading && !writing)
RuntimeError("File: either fileOptionsRead or fileOptionsWrite must be specified");
// convert fileOptions to fopen()'s mode string
wstring options = reading ? L"r" : L"";
if (writing)
{
// if we already are reading the file, change to read/write
options.clear();
options.append(L"w");
if (!outputPipe && m_filename != L"-")
{
options.append(L"+");
msra::files::make_intermediate_dirs(m_filename.c_str()); // writing to regular file -> also create the intermediate directories as a convenience
}
}
if (fileOptions&fileOptionsBinary)
{
options += L"b";
}
else
{
if (fileOptions & fileOptionsUnicode)
options += L"b";
else
options += L"t";
// I attempted to use the translated characterset modes, but encountered strange errors
//options += L"t, ccs=";
//options += (fileOptions & fileOptionsUnicode)?L"UNICODE":L"UTF-8";
}
// add sequential flag to allocate big read buffer
if (fileOptions & fileOptionsSequential)
options += L"S";
// now open the file
// Special path syntax understood here:
// - "-" refers to stdin or stdout
// - "|cmd" writes to a pipe
// - "cmd|" reads from a pipe
m_pcloseNeeded = false;
m_seekable = false;
if (m_filename == L"-") // stdin/stdout
{
if (writing && reading)
RuntimeError("File: cannot specify fileOptionsRead and fileOptionsWrite at once with path '-'");
m_file = writing ? stdout : stdin;
}
else if (outputPipe || inputPipe) // pipe syntax
{
if (inputPipe && outputPipe)
RuntimeError("File: pipes cannot specify fileOptionsRead and fileOptionsWrite at once");
if (inputPipe != reading)
RuntimeError("File: pipes must use consistent fileOptionsRead/fileOptionsWrite");
const auto command = inputPipe ? m_filename.substr(0, m_filename.size() - 1) : m_filename.substr(1);
m_file = _wpopen(command.c_str(), options.c_str());
if (!m_file)
RuntimeError("File: error exexuting pipe command '%S': %s", command.c_str(), strerror(errno));
m_pcloseNeeded = true;
}
else attempt([=]() // regular file: use a retry loop
{
m_file = fopenOrDie(filename, options.c_str());
m_seekable = true;
});
}
// SkipToDelimiter - consume characters up to and including the given delimiter character
// delim - the delimiter to stop after
// Raises a LogicError when the stream ends, or a read fails, before the delimiter is seen.
void File::SkipToDelimiter(int delim)
{
    for (;;)
    {
        const int ch = fgetc(m_file);
        // Test fgetc()'s return value instead of feof(): a read error also returns EOF but does not
        // set the end-of-file indicator, which would make a feof()-based loop spin forever.
        if (ch == EOF)
        {
            // report on stderr, since stdout may be the data stream (path "-")
            fprintf(stderr, "Unexpected end of file\n");
            LogicError("Unexpected end of file\n");
        }
        // no initial sentinel value is used, so a delimiter of 0 is searched for like any other
        if (ch == delim)
            break;
    }
}
// IsTextBased - was the file opened with fileOptionsText or fileOptionsUnicode?
// returns - true if either of the two flags is set in the stored options
bool File::IsTextBased()
{
    const int textFlags = fileOptionsText | fileOptionsUnicode;
    return (m_options & textFlags) != 0;
}
// File Destructor
// Closes the underlying stream: pipes via _pclose(), regular files via fclose();
// the standard streams (stdin/stdout/stderr) are left open.
// Note: this does not check for errors. Use Flush() before closing a file you are writing.
File::~File(void)
{
    if (!m_pcloseNeeded)
    {
        const bool isStandardStream = (m_file == stdin) || (m_file == stdout) || (m_file == stderr);
        if (!isStandardStream)
            fclose(m_file); // (since destructors may not throw, we ignore the return code here)
        return;
    }
    _pclose(m_file);
}
// Flush - flush the underlying stream via fflushOrDie()
// NOTE(review): fflushOrDie() presumably reports a failed flush as an error (per its name) -- confirm in fileutil
void File::Flush()
{
fflushOrDie(m_file);
}
// GetLine - get a line from the file
// str - [out] receives the result of fgetlinew(); any previous content is replaced
void File::GetLine(wstring& str)
{
str = fgetlinew(m_file);
}
// GetLine - get a line from the file
// str - [out] receives the result of fgetline(); any previous content is replaced
void File::GetLine(string& str)
{
str = fgetline(m_file);
}
// FileGetLines - shared helper behind File::GetLines(): appends lines to 'lines' until the file reports EOF
// file - the file to read from
// lines - [in,out] each line read is appended; existing entries are kept
// NOTE(review): IsEOF() is tested before each read, so whatever GetLine() yields on the read that
// first hits end-of-file is appended as well -- confirm this is intended.
template<typename STRING> static void FileGetLines(File & file, std::vector<STRING>& lines)
{
    for (STRING current; !file.IsEOF(); )
    {
        file.GetLine(current);
        lines.push_back(current);
    }
}
void File::GetLines(std::vector<std::wstring>& lines) { FileGetLines(*this, lines); };
void File::GetLines(std::vector<std::string>& lines) { FileGetLines(*this, lines); }
// Write a wstring to the file by handing its character buffer to WriteString()
// val - value to write to the file
// returns - this file, to allow chaining
File& File::operator<<(const std::wstring& val)
{
    File& self = *this;
    self.WriteString(val.c_str());
    return self;
}
// Write a string to the file by handing its character buffer to WriteString()
// val - value to write to the file
// returns - this file, to allow chaining
File& File::operator<<(const std::string& val)
{
    File& self = *this;
    self.WriteString(val.c_str());
    return self;
}
// Put a marker in the file; what (if anything) is written depends on the file type
// marker - marker to place in the file
// returns - this file, to allow chaining
File& File::operator<<(FileMarker marker)
{
    const bool unicodeFile = (m_options & fileOptionsUnicode) != 0;
    const bool textFile = (m_options & fileOptionsText) != 0;
    switch (marker)
    {
    case fileMarkerBeginFile: // byte order mark, only exists for UNICODE files
        if (unicodeFile)
            *this << (unsigned int)0xfeff;
        break;
    case fileMarkerEndFile: // text-style files are terminated with ^Z
        if (unicodeFile)
            *this << wchar_t(26);
        else if (textFile)
            *this << char(26);
        break;
    case fileMarkerEndList: // end of line/list: carriage return/line feed
        if (unicodeFile)
            WriteString(L"\r\n");
        else if (textFile)
            WriteString("\r\n");
        break;
    case fileMarkerBeginList:     // no marker written
    case fileMarkerListSeparator: // nothing for now; elements are already separated by the built-in space delimiter
                                  // future: make this customizable, so you can specify a separator (i.e. ',')
        break;
    case fileMarkerBeginSection:
    case fileMarkerEndSection:
        assert(false); // sections should use a string modifier
        break;
    }
    return *this;
}
// PutMarker for beginning of list support (lists with a count)
// marker - must be fileMarkerBeginList (only checked by assert)
// count - [in] the number of elements in the list; this is the only thing written
// returns - this file, to allow chaining
File& File::PutMarker(FileMarker marker, size_t count)
{
    assert(marker == fileMarkerBeginList); // only beginning of list supported for count markers
    (void)marker;                          // referenced so release builds do not flag it as unused
    File& self = *this;
    self << count;
    return self;
}
// PutMarker for section beginning and ending tags
// marker - must be fileMarkerBeginSection or fileMarkerEndSection (only checked by assert)
// section - [in] name of section; written to the file as a string
// returns - this file, to allow chaining
File& File::PutMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    const bool isSectionMarker = (marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    assert(isSectionMarker);
    (void)isSectionMarker; // referenced so release builds do not flag it as unused
    *this << section;
    return *this;
}
// PutMarker for section beginning and ending tags (wide-character version)
// marker - must be fileMarkerBeginSection or fileMarkerEndSection (only checked by assert)
// section - [in] name of section; written to the file as a string
// returns - this file, to allow chaining
File& File::PutMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    const bool isSectionMarker = (marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    assert(isSectionMarker);
    (void)isSectionMarker; // referenced so release builds do not flag it as unused
    *this << section;
    return *this;
}
// Read a wstring from the file: a token (fgetwtoken) for text-based files, a stored string (fgetwstring) otherwise
// val - [out] value read from the file
// returns - this file, to allow chaining
File& File::operator>>(std::wstring& val)
{
    const bool textMode = IsTextBased();
    attempt([&]
    {
        if (!textMode)
            val = fgetwstring(m_file);
        else
            val = fgetwtoken(m_file);
    });
    return *this;
}
// Read a string from the file: a token (fgettoken) for text-based files, a stored string (fgetstring) otherwise
// val - [out] value read from the file
// returns - this file, to allow chaining
File& File::operator>>(std::string& val)
{
    const bool textMode = IsTextBased();
    attempt([&]
    {
        if (!textMode)
            val = fgetstring(m_file);
        else
            val = fgettoken(m_file);
    });
    return *this;
}
// ReadChars - read a specified number of characters, and reset read pointer if requested
// val - [out] resized to cnt characters, each filled by reading one char with operator>>
// cnt - number of characters to read
// reset - if true, the file position from before the read is restored afterwards
//         (requires a seekable stream, see GetPosition())
void File::ReadChars(std::string& val, size_t cnt, bool reset)
{
    // keep the full 64-bit position; a size_t would truncate it on 32-bit builds
    uint64_t pos = 0; // (initialize to keep compiler happy)
    if (reset)
        pos = GetPosition();
    val.resize(cnt);
    // index with size_t to match cnt, and write through operator[] rather than casting away c_str()'s const
    for (size_t i = 0; i < cnt; ++i)
        *this >> val[i];
    if (reset)
        SetPosition(pos);
}
// ReadChars - read a specified number of wide characters, and reset read pointer if requested
// val - [out] resized to cnt characters, each filled by reading one wchar_t with operator>>
// cnt - number of characters to read
// reset - if true, the file position from before the read is restored afterwards
//         (requires a seekable stream, see GetPosition())
void File::ReadChars(std::wstring& val, size_t cnt, bool reset)
{
    // keep the full 64-bit position; a size_t would truncate it on 32-bit builds
    uint64_t pos = 0; // (initialize to keep compiler happy)
    if (reset)
        pos = GetPosition();
    val.resize(cnt);
    // index with size_t to match cnt, and write through operator[] rather than casting away c_str()'s const
    for (size_t i = 0; i < cnt; ++i)
        *this >> val[i];
    if (reset)
        SetPosition(pos);
}
// WriteString - outputs a narrow string into the file
// str - the string to output
// size - number of characters to output; if not positive, str is treated as null terminated
// With a positive size, or for text-based files, the string is written preceded by a single space;
// otherwise it is handed to fputstring().
void File::WriteString(const char* str, int size)
{
    attempt([&]
    {
        // explicit length: print at most 'size' characters
        if (size > 0)
        {
            fwprintf(m_file, L" %.*hs", size, str);
            return;
        }
        // null terminated, non-text file
        if (!IsTextBased())
        {
            fputstring(m_file, str);
            return;
        }
        // null terminated, text file
        fwprintf(m_file, L" %hs", str);
    });
}
// ReadString - reads a string from the file into a caller-supplied buffer
// str - the string buffer to read the string into
// size - size of the string buffer
// Text-based files are read with fgettoken(), all others with fgetstring().
void File::ReadString(char* str, int size)
{
    attempt([&]
    {
        if (!IsTextBased())
        {
            fgetstring(m_file, str, size);
            return;
        }
        fgettoken(m_file, str, size);
    });
}
// WriteString - outputs a wide string into the file
// str - the string to output
// size - number of characters to output; if not positive, str is treated as null terminated
// With a positive size, or for text-based files, the string is written preceded by a single space;
// otherwise it is handed to fputstring().
// Note: strings with embedded spaces are NOT quoted. The former EMBEDDED_SPACES experiment was removed:
// it never affected the output, and its search loop could not terminate once a match was found.
void File::WriteString(const wchar_t* str, int size)
{
    attempt([&]
    {
        if (size > 0)
            fwprintf(m_file, L" %.*ls", size, str);
        else if (IsTextBased())
            fwprintf(m_file, L" %ls", str);
        else
            fputstring(m_file, str);
    });
}
// ReadString - reads a wide string from the file into a caller-supplied buffer
// str - the string buffer to read the string into
// size - size of the string buffer
// Text-based files are read with fgettoken(), all others with fgetstring().
void File::ReadString(wchar_t* str, int size)
{
    attempt([&]
    {
        if (!IsTextBased())
        {
            fgetstring(m_file, str, size);
            return;
        }
        fgettoken(m_file, str, size);
    });
}
// IsUnicodeBOM - are the next characters the Unicode Byte Order Mark?
// skip - skip the BOM mark if found (defaults to false)
// returns - true if on a unicode BOM; always false when not positioned at the start of the file
// Requires a seekable stream, because the position is queried and possibly restored.
bool File::IsUnicodeBOM(bool skip)
{
    const uint64_t pos = GetPosition();
    // if we aren't at the beginning of the file, it can't be the byte order mark
    if (pos != 0)
        return false;

    bool found = false;
    if (m_options & fileOptionsUnicode)
    {
        // UNICODE files: the mark is read back as the value 0xfeff
        unsigned int bom = 0;
        if (IsTextBased())
            ftrygetText(m_file, bom);
        else
            fget(m_file, bom);
        // future: one reason for the BOM is to detect other-endian files, should we support?
        found = (bom == 0xfeff);
    }
    else if (m_options & fileOptionsText)
    {
        // Text files: look for the three-byte sequence EF BB BF.
        // The bytes are read with fgetc(), which yields them as non-negative ints; comparing plain
        // (possibly signed) char values against 0xEF/0xBB/0xBF can never match.
        const int byte0 = fgetc(m_file);
        const int byte1 = (byte0 == 0xEF) ? fgetc(m_file) : EOF;
        const int byte2 = (byte1 == 0xBB) ? fgetc(m_file) : EOF;
        found = (byte2 == 0xBF);
    }

    // restore pointer if no BOM or we aren't skipping it
    if (!found || !skip)
        SetPosition(pos);
    return found;
}
// Size - return the size of the file as reported by filesize()
// WARNING: calling this will reset the EOF marker, so do so with care
// Raises a RuntimeError on non-seekable streams.
size_t File::Size()
{
    const bool seekable = CanSeek();
    if (!seekable)
        RuntimeError("File: attempted to get Size() on non-seekable stream");
    return filesize(m_file);
}
// IsEOF - has the stream's end-of-file indicator been set (i.e. did a read go past the end)?
// return - true if end of file has been found
bool File::IsEOF()
{
    const int eofIndicator = feof(m_file);
    return eofIndicator != 0;
}
// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
// skip - skip the whitespace if found (defaults to false); without it only one character is examined
// returns - true if whitespace found
// The first non-skipped character is pushed back onto the stream; if the end of the file is hit,
// nothing is pushed back and the result so far is returned.
bool File::IsWhiteSpace(bool skip)
{
    bool sawSpace = false;
    if (m_options & fileOptionsUnicode)
    {
        for (;;)
        {
            const wint_t wc = fgetwc(m_file);
            if (wc == WEOF) // hit the end
                return sawSpace;
            const bool isSpace = iswspace(wc) != 0;
            if (isSpace)
                sawSpace = true;
            if (!(isSpace && skip))
            {
                // put back the last character read
                ungetwc(wc, m_file);
                break;
            }
        }
    }
    else
    {
        for (;;)
        {
            const int ch = fgetc(m_file);
            if (ch == EOF) // hit the end
                return sawSpace;
            const bool isSpace = isspace(ch) != 0;
            if (isSpace)
                sawSpace = true;
            if (!(isSpace && skip))
            {
                // put back the last character read
                ungetc(ch, m_file);
                break;
            }
        }
    }
    return sawSpace;
}
// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceded by (space, \t)? EOF detected too
// skip - skip the end of line if found (defaults to false)
// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
//           (the value comes straight from fskipwNewline()/fskipNewline(); files that are neither UNICODE nor text always yield false)
int File::EndOfLineOrEOF(bool skip)
{
    if (m_options & fileOptionsUnicode)
        return fskipwNewline(m_file, skip);
    if (m_options & fileOptionsText)
        return fskipNewline(m_file, skip);
    return false;
}
// Consume an expected marker from the file
// Some markers are ignored, others expect specific characters;
// markers that require parameters must be read with the GetMarker methods.
// marker - the marker expected at the current position
// returns - this file, to allow chaining
// Raises a RuntimeError if an end-of-file or end-of-list marker is expected but not found.
File& File::operator>>(FileMarker marker)
{
    switch (marker)
    {
    case fileMarkerBeginFile: // skip a Unicode BOM if one is present (text-based files only)
        if (IsTextBased())
            IsUnicodeBOM(true);
        break;
    case fileMarkerEndFile: // the stream must already be at end of file
        if (!IsEOF())
            RuntimeError("fileMarkerEndFile not found");
        break;
    case fileMarkerEndList: // end of line/list marker; an EOF result does not count as a newline
        if (IsTextBased() && EndOfLineOrEOF(true) != (int)true)
            RuntimeError("Newline not found");
        break;
    case fileMarkerBeginList:     // no marker written unless a list with a count header
    case fileMarkerListSeparator: // nothing for now; elements are separated by the built-in space delimiter
                                  // future: make this customizable, so you can specify a separator (i.e. ',')
        break;
    case fileMarkerBeginSection:
    case fileMarkerEndSection:
        assert(false); // sections should use a string modifier
        break;
    }
    return *this;
}
// IsMarker - test whether the given marker is next in the file
// marker - the marker to look for
// skip - passed on to IsUnicodeBOM()/EndOfLineOrEOF() to consume the marker when it is found
// returns - true if the marker was detected; list-begin, list-separator and section markers
//           cannot be detected and always yield false
bool File::IsMarker(FileMarker marker, bool skip)
{
    switch (marker)
    {
    case fileMarkerBeginFile: // check for Unicode BOM marker
        return IsUnicodeBOM(skip);
    case fileMarkerEndFile: // end of file marker
        return IsEOF();
    case fileMarkerEndList: // end of line/list marker; an EOF result does not count
        if (IsTextBased())
            return EndOfLineOrEOF(skip) == (int)true;
        break;
    case fileMarkerBeginList:
        // no marker written unless a list with a count header
        // should we try to validate BOL header (just know it's an int, not negative, etc.)
        break;
    case fileMarkerListSeparator:
        // do nothing for now, built in space delimiter for all types (before type)
        // future: make this customizable, so you can specify a separator (i.e. ',')
        break;
    case fileMarkerBeginSection:
    case fileMarkerEndSection:
        // can't distinguish from a string currently
        break;
    }
    return false;
}
// GetMarker for beginning of list support (lists with a count)
// marker - must be fileMarkerBeginList (only checked by assert)
// count - [out] returns the number of elements in the list
// returns - this file, to allow chaining
File& File::GetMarker(FileMarker marker, size_t& count)
{
    assert(marker == fileMarkerBeginList); // only beginning of list supported for count file markers
    (void)marker;                          // referenced so release builds do not flag it as unused
    if (!IsTextBased())
        fget(m_file, count);
    else
        ftrygetText(m_file, count); // use text based try, so it can fail without an exception
    return *this;
}
// GetMarker for section beginning and ending tags
// marker - must be fileMarkerBeginSection or fileMarkerEndSection (only checked by assert)
// section - [in] name of section that is expected
// returns - this file, to allow chaining
// Raises a RuntimeError if the string read from the file differs from the expected section name.
File& File::GetMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void)marker;
    std::string actual;
    *this >> actual;
    if (actual == section)
        return *this;
    RuntimeError(std::string("section name mismatch ") + actual + " != " + section);
    return *this;
}
// GetMarker for section beginning and ending tags (wide-character version)
// marker - must be fileMarkerBeginSection or fileMarkerEndSection (only checked by assert)
// section - [in] name of section that is expected
// returns - this file, to allow chaining
// Raises a RuntimeError if the string read from the file differs from the expected section name.
File& File::GetMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void)marker;
    std::wstring actual;
    *this >> actual;
    if (actual == section)
        return *this;
    // the error message is narrow, so both names are converted with msra::strfun::utf8()
    RuntimeError(std::string("section name mismatch ") + msra::strfun::utf8(actual) + " != " + msra::strfun::utf8(section));
    return *this;
}
// TryGetMarker for section beginning and ending tags (wide-character version)
// marker - must be fileMarkerBeginSection or fileMarkerEndSection (only checked by assert)
// section - [in] name of section that is expected
// returns - true if the next string in the file equals 'section' (the string is then consumed);
//           false otherwise, with the file position restored to where it was on entry
// Requires a seekable stream (see GetPosition()).
bool File::TryGetMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); (void)marker;
    // keep the full 64-bit position; a size_t would truncate it on 32-bit builds
    const uint64_t pos = GetPosition();
    try
    {
        std::wstring str;
        *this >> str;
        if (str == section)
            return true;
    }
    catch (...)
    {
        // a failed read simply means the marker is not there; fall through and rewind
    }
    SetPosition(pos);
    return false;
}
// TryGetMarker for section beginning and ending tags
// marker - must be fileMarkerBeginSection or fileMarkerEndSection (only checked by assert)
// section - [in] name of section that is expected
// returns - true if the next string in the file equals 'section' (the string is then consumed);
//           false otherwise, with the file position restored to where it was on entry
// Requires a seekable stream (see GetPosition()).
bool File::TryGetMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection); (void)marker;
    // keep the full 64-bit position; a size_t would truncate it on 32-bit builds
    const uint64_t pos = GetPosition();
    try
    {
        std::string str;
        *this >> str;
        if (str == section)
            return true;
    }
    catch (...)
    {
        // a failed read simply means the marker is not there; fall through so the position is
        // restored here as well (matching the wstring overload) instead of returning early
    }
    SetPosition(pos);
    return false;
}
// GetPosition - get the current position in the file
// returns - the position as reported by fgetpos()
// Raises a RuntimeError on non-seekable streams.
uint64_t File::GetPosition()
{
    const bool seekable = CanSeek();
    if (!seekable)
        RuntimeError("File: attempted to GetPosition() on non-seekable stream");
    return fgetpos(m_file);
}
// SetPosition - set the position in the file
// pos - position in the file, passed on to fsetpos()
// Raises a RuntimeError on non-seekable streams.
void File::SetPosition(uint64_t pos)
{
    const bool seekable = CanSeek();
    if (!seekable)
        RuntimeError("File: attempted to SetPosition() on non-seekable stream");
    fsetpos(m_file, pos);
}
}}}