2014-08-30 03:21:42 +04:00
//
2016-01-18 11:36:17 +03:00
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
2014-08-30 03:21:42 +04:00
//
2014-10-23 20:16:51 +04:00
2015-02-06 03:19:55 +03:00
# ifndef _CRT_SECURE_NO_WARNINGS
2014-10-23 22:55:53 +04:00
# define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
2015-02-06 03:19:55 +03:00
# endif
2016-01-18 11:36:14 +03:00
# define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
2014-10-23 20:16:51 +04:00
2015-05-19 05:36:04 +03:00
# include "Basics.h"
2014-08-30 03:21:42 +04:00
# define FORMAT_SPECIALIZE // to get the specialized version of the format routines
# include "File.h"
2014-10-24 00:14:19 +04:00
# include <string>
# include <stdint.h>
2014-10-31 02:28:43 +03:00
# include <locale>
2014-10-31 00:41:33 +03:00
# ifdef _WIN32
2016-01-22 21:45:53 +03:00
# define NOMINMAX
# include "Windows.h"
2016-03-23 17:32:19 +03:00
# include <VersionHelpers.h>
# include <Shlwapi.h>
# pragma comment(lib, "Shlwapi.lib")
2014-10-31 00:41:33 +03:00
# endif
# ifdef __unix__
# include <unistd.h>
2016-03-06 08:40:02 +03:00
# include <linux/limits.h> // for PATH_MAX
2014-10-31 00:41:33 +03:00
# endif
2014-08-30 03:21:42 +04:00
2016-09-28 07:08:51 +03:00
# define PCLOSE_ERROR -1
2016-11-07 22:40:00 +03:00
# define WRITE_BUFFER_SIZE (1024 * 1024)
2016-09-28 07:08:51 +03:00
2015-06-30 05:24:18 +03:00
namespace Microsoft { namespace MSR { namespace CNTK {
2014-08-30 03:21:42 +04:00
// File creation from a wide-character path
// filename    - the path; "-" and the pipe forms "|cmd" / "cmd|" are interpreted by Init()
// fileOptions - bitwise combination of fileOptions* flags describing how to open the file
File::File(const std::wstring& filename, int fileOptions)
{
    const wchar_t* path = filename.c_str();
    Init(path, fileOptions);
}
// File creation from a narrow-character path
// The path is first widened with msra::strfun::utf16(); the resulting buffer is then passed to Init().
File::File(const std::string& filename, int fileOptions)
{
    const auto widePath = msra::strfun::utf16(filename);
    Init(widePath.c_str(), fileOptions);
}
// File creation from a zero-terminated wide-character path; forwards directly to Init()
File::File(const wchar_t* filename, int fileOptions)
{
    this->Init(filename, fileOptions);
}
2016-02-15 08:24:59 +03:00
// Tests whether a path denotes something other than a regular file.
// Recognized special forms:
//  - "|command": output pipe
//  - "command|": input pipe
//  - "-":        stdin/stdout
// An empty path is never a special form.
template <class String>
static bool IsNonFilePath(const String& filename)
{
    // front()/back() must not be called on an empty string (undefined behavior)
    if (filename.empty())
        return false;
    if (filename.front() == '|' || filename.back() == '|')
        return true;
    return filename.size() == 1 && filename[0] == '-';
}
2016-02-15 07:28:08 +03:00
// test if a file exists
// If the pathname is a pipe, it is considered to exist.
template < class String >
2016-02-15 08:24:59 +03:00
/*static*/ bool File : : Exists ( const String & filename )
2016-02-15 07:28:08 +03:00
{
2016-02-15 08:24:59 +03:00
return IsNonFilePath ( filename ) | | fexists ( filename ) ;
2016-02-15 07:28:08 +03:00
}
2016-02-15 08:24:59 +03:00
template /*static*/ bool File : : Exists < string > ( const string & filename ) ;
2016-02-15 07:28:08 +03:00
template /*static*/ bool File : : Exists < wstring > ( const wstring & filename ) ;
2016-02-15 08:24:59 +03:00
template < class String >
/*static*/ void File : : MakeIntermediateDirs ( const String & filename )
{
if ( ! IsNonFilePath ( filename ) )
msra : : files : : make_intermediate_dirs ( filename ) ;
}
//template /*static*/ void File::MakeIntermediateDirs<string> (const string& filename); // implement this if needed
template /*static*/ void File : : MakeIntermediateDirs < wstring > ( const wstring & filename ) ;
2015-06-30 05:24:18 +03:00
// all constructors call this
2014-08-30 03:21:42 +04:00
void File : : Init ( const wchar_t * filename , int fileOptions )
{
2015-06-30 05:24:18 +03:00
m_filename = filename ;
m_options = fileOptions ;
if ( m_filename . empty ( ) )
RuntimeError ( " File: filename is empty " ) ;
const auto outputPipe = ( m_filename . front ( ) = = ' | ' ) ;
2016-02-15 07:28:08 +03:00
const auto inputPipe = ( m_filename . back ( ) = = ' | ' ) ;
2014-08-30 03:21:42 +04:00
// translate the options string into a string for fopen()
2015-06-30 05:24:18 +03:00
const auto reading = ! ! ( fileOptions & fileOptionsRead ) ;
const auto writing = ! ! ( fileOptions & fileOptionsWrite ) ;
if ( ! reading & & ! writing )
RuntimeError ( " File: either fileOptionsRead or fileOptionsWrite must be specified " ) ;
// convert fileOptions to fopen()'s mode string
wstring options = reading ? L " r " : L " " ;
if ( writing )
2014-08-30 03:21:42 +04:00
{
// if we already are reading the file, change to read/write
options . clear ( ) ;
2015-06-30 05:24:18 +03:00
options . append ( L " w " ) ;
if ( ! outputPipe & & m_filename ! = L " - " )
{
options . append ( L " + " ) ;
2016-01-18 11:36:14 +03:00
msra : : files : : make_intermediate_dirs ( m_filename . c_str ( ) ) ; // writing to regular file -> also create the intermediate directories as a convenience
2015-06-30 05:24:18 +03:00
}
2014-08-30 03:21:42 +04:00
}
2016-01-18 11:36:14 +03:00
if ( fileOptions & fileOptionsBinary )
2014-08-30 03:21:42 +04:00
options + = L " b " ;
else
2016-02-15 07:28:08 +03:00
options + = L " t " ;
2014-08-30 03:21:42 +04:00
// add sequential flag to allocate big read buffer
if ( fileOptions & fileOptionsSequential )
options + = L " S " ;
2015-06-30 05:24:18 +03:00
// now open the file
// Special path syntax understood here:
// - "-" refers to stdin or stdout
// - "|cmd" writes to a pipe
// - "cmd|" reads from a pipe
m_pcloseNeeded = false ;
m_seekable = false ;
2016-01-18 11:36:14 +03:00
if ( m_filename = = L " - " ) // stdin/stdout
2015-06-30 05:24:18 +03:00
{
if ( writing & & reading )
2015-07-03 04:53:44 +03:00
RuntimeError ( " File: cannot specify fileOptionsRead and fileOptionsWrite at once with path '-' " ) ;
2015-06-30 05:24:18 +03:00
m_file = writing ? stdout : stdin ;
}
2016-01-18 11:36:14 +03:00
else if ( outputPipe | | inputPipe ) // pipe syntax
2015-06-30 05:24:18 +03:00
{
if ( inputPipe & & outputPipe )
RuntimeError ( " File: pipes cannot specify fileOptionsRead and fileOptionsWrite at once " ) ;
if ( inputPipe ! = reading )
RuntimeError ( " File: pipes must use consistent fileOptionsRead/fileOptionsWrite " ) ;
const auto command = inputPipe ? m_filename . substr ( 0 , m_filename . size ( ) - 1 ) : m_filename . substr ( 1 ) ;
m_file = _wpopen ( command . c_str ( ) , options . c_str ( ) ) ;
if ( ! m_file )
2016-01-18 11:35:54 +03:00
RuntimeError ( " File: error exexuting pipe command '%S': %s " , command . c_str ( ) , strerror ( errno ) ) ;
2015-06-30 05:24:18 +03:00
m_pcloseNeeded = true ;
}
2016-01-18 11:36:14 +03:00
else
attempt ( [ = ] ( ) // regular file: use a retry loop
{
m_file = fopenOrDie ( filename , options . c_str ( ) ) ;
m_seekable = true ;
} ) ;
2014-08-30 03:21:42 +04:00
}
2016-03-06 08:27:04 +03:00
// determine the directory for a given pathname
// (wstring only for now; feel free to make this a template if needed)
// path    - the pathname to split
// returns - the path with its last element removed, or "." if there is no directory part
/*static*/ wstring File::DirectoryPathOf(wstring path)
{
#ifdef _WIN32
    // Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
    // TODO:
    // "PathCchCanonicalize does the / to \ conversion as a part of the canonicalization, it's
    // probably a good idea to do that anyway since I suspect that the '..' characters might
    // confuse the other PathCch functions" [Larry Osterman]
    // "Consider GetFullPathName both for canonicalization and last element finding." [Jay Krell]
    path = msra::strfun::ReplaceAll<wstring>(path, L"/", L"\\");

    HRESULT hr;
    if (IsWindows8OrGreater()) // PathCchRemoveFileSpec() only available on Windows 8+
    {
        // the API uses the WINAPI calling convention; the pointer type must match it
        typedef HRESULT(WINAPI * PathCchRemoveFileSpecProc)(PWSTR, size_t);

        HINSTANCE hinstLib = LoadLibrary(TEXT("api-ms-win-core-path-l1-1-0.dll"));
        if (hinstLib == nullptr)
            RuntimeError("DirectoryPathOf: LoadLibrary() unexpectedly failed.");

        PathCchRemoveFileSpecProc PathCchRemoveFileSpec = reinterpret_cast<PathCchRemoveFileSpecProc>(GetProcAddress(hinstLib, "PathCchRemoveFileSpec"));
        if (!PathCchRemoveFileSpec)
        {
            FreeLibrary(hinstLib); // do not leak the module handle on the error path
            RuntimeError("DirectoryPathOf: GetProcAddress() unexpectedly failed.");
        }

        // this is the actual function call we care about
        // The buffer size is given in characters and includes the zero terminator.
        hr = PathCchRemoveFileSpec(&path[0], path.size() + 1);

        FreeLibrary(hinstLib);
    }
    else // on Windows 7-, use older PathRemoveFileSpec() instead
        hr = PathRemoveFileSpec(&path[0]) ? S_OK : S_FALSE;

    if (hr == S_OK) // done
        path.resize(wcslen(&path[0])); // the API shortened the string in place; sync the length
    else if (hr == S_FALSE) // nothing to remove: use .
        path = L".";
    else
        RuntimeError("DirectoryPathOf: Path(Cch)RemoveFileSpec() unexpectedly failed with 0x%08x.", (unsigned int) hr);
#else
    auto pos = path.find_last_of(L"/");
    if (pos != path.npos)
        path.erase(pos);
    else // if no directory path at all, use current directory
        return L".";
#endif
    return path;
}
// determine the file name for a given pathname
// (wstring only for now; feel free to make this a template if needed)
// path    - the pathname to split
// returns - everything after the last path delimiter, or the whole path if there is none
/*static*/ wstring File::FileNameOf(wstring path)
{
#ifdef _WIN32 // same platform macro as the rest of this file
    static const wstring delim = L"\\:/";
#else
    static const wstring delim = L"/";
#endif
    const auto pos = path.find_last_of(delim);
    if (pos == path.npos) // no directory path
        return path;
    return path.substr(pos + 1);
}
// get path of current executable
// returns - the full path of the running executable
// Fails with LogicError() (Windows) or RuntimeError() (otherwise) if the path cannot be determined.
/*static*/ wstring File::GetExecutablePath()
{
#ifdef _WIN32 // same platform macro as the rest of this file
    wchar_t path[33000];
    if (GetModuleFileNameW(NULL, path, _countof(path)) == 0)
        LogicError("GetExecutablePath: GetModuleFileNameW() unexpectedly failed.");
    return path;
#else
    // from http://stackoverflow.com/questions/4025370/can-an-executable-discover-its-own-path-linux
    pid_t pid = getpid();
    char path[PATH_MAX + 1] = {0};
    snprintf(path, sizeof(path), "/proc/%d/exe", (int) pid);
    // readlink() does not zero-terminate; dest is zero-initialized and one char larger than the read limit
    char dest[PATH_MAX + 1] = {0};
    if (readlink(path, dest, PATH_MAX) == -1)
        RuntimeError("GetExecutablePath: readlink() call failed.");
    return msra::strfun::utf16(dest);
#endif
}
2015-06-30 05:24:18 +03:00
// skip to given delimiter character
void File : : SkipToDelimiter ( int delim )
2014-08-30 03:21:42 +04:00
{
2016-01-18 11:36:14 +03:00
int ch = 0 ;
2014-08-30 03:21:42 +04:00
2016-01-18 11:36:14 +03:00
while ( ch ! = delim )
{
ch = fgetc ( m_file ) ;
if ( feof ( m_file ) )
{
2014-08-30 03:21:42 +04:00
printf ( " Unexpected end of file \n " ) ;
2015-06-30 05:24:18 +03:00
LogicError ( " Unexpected end of file \n " ) ;
2014-08-30 03:21:42 +04:00
}
}
}
bool File : : IsTextBased ( )
{
2016-02-15 07:28:08 +03:00
return ! ! ( m_options & fileOptionsText ) ;
2014-08-30 03:21:42 +04:00
}
// File Destructor
// closes the file
2016-03-05 00:50:59 +03:00
// Note: this does not check for errors when the File corresponds to pipe stream. In this case, use Flush() before closing a file you are writing.
2014-08-30 03:21:42 +04:00
File : : ~ File ( void )
{
2016-09-28 07:08:51 +03:00
int rc = 0 ;
2015-06-30 05:24:18 +03:00
if ( m_pcloseNeeded )
2016-03-04 22:48:21 +03:00
{
2016-09-28 07:08:51 +03:00
rc = _pclose ( m_file ) ;
if ( ( rc = = PCLOSE_ERROR ) & & ! std : : uncaught_exception ( ) )
{
RuntimeError ( " File: failed to close file at %S " , m_filename . c_str ( ) ) ;
}
2016-03-04 22:48:21 +03:00
}
2015-06-30 05:24:18 +03:00
else if ( m_file ! = stdin & & m_file ! = stdout & & m_file ! = stderr )
2016-03-04 22:48:21 +03:00
{
2016-09-28 07:08:51 +03:00
rc = fclose ( m_file ) ;
if ( ( rc ! = FCLOSE_SUCCESS ) & & ! std : : uncaught_exception ( ) )
{
2016-03-04 22:48:21 +03:00
RuntimeError ( " File: failed to close file at %S " , m_filename . c_str ( ) ) ;
2016-09-28 07:08:51 +03:00
}
2016-03-04 22:48:21 +03:00
}
2015-06-30 05:24:18 +03:00
}
void File : : Flush ( )
{
fflushOrDie ( m_file ) ;
2014-08-30 03:21:42 +04:00
}
2016-02-15 07:28:08 +03:00
// read a line
// End of line is denoted by one of these, i.e. we don't support the old Mac OS convention of CR
// - LF
// - CR+LF
// - EOF
// read one character from 'f' into 'c'
// returns - false when getc() reports EOF ('c' then holds the truncated EOF value), true otherwise
static bool fgetc(char& c, FILE* f)
{
    const int value = getc(f);
    c = static_cast<char>(value);
    return !(value == EOF);
}
// does the byte sequence at 's' start with the UTF-8 byte order mark (EF BB BF)?
// Comparison stops at the first mismatching byte, so a shorter zero-terminated string is never read past its terminator.
static inline bool BeginsWithUnicodeBOM(const char* s)
{
    static const unsigned char bom[] = {0xEF, 0xBB, 0xBF};
    for (size_t i = 0; i < 3; i++)
    {
        if ((unsigned char) s[i] != bom[i])
            return false;
    }
    return true;
}
// read a 8-bit string until newline is hit
template < class STRING >
static void fgets ( STRING & s , FILE * f )
{
s . resize ( 0 ) ;
char c ;
while ( fgetc ( c , f ) )
{
if ( c = = ' \n ' | | c = = ' \r ' )
{
if ( c = = ' \r ' & & ( ! fgetc ( c , f ) | | c ! = ' \n ' ) )
RuntimeError ( " fgets: malformed text file, CR without LF " ) ;
break ;
}
s . push_back ( c ) ;
// strip Unicode BOM
// We strip it from any string, not just at the start.
// This allows to UNIX-'cat' multiple UTF-8 files with BOMs.
// Since the BOM is otherwise invalid within a file, this is well-defined and upwards compatible.
if ( s . size ( ) = = 3 & & BeginsWithUnicodeBOM ( s . c_str ( ) ) )
s . clear ( ) ;
}
2014-08-30 03:21:42 +04:00
}
// GetLine - get a line from the file
2016-01-18 11:36:14 +03:00
// str - string
2014-08-30 03:21:42 +04:00
void File : : GetLine ( string & str )
{
2016-02-15 07:28:08 +03:00
fgets ( str , m_file ) ;
2014-08-30 03:21:42 +04:00
}
2016-02-15 07:28:08 +03:00
// append a line to a vector of lines; the wstring overload converts the line with msra::strfun::utf16() first
// (both overloads take the line by const reference since neither modifies it)
static void PushBackString(vector<string>& lines, const string& s)
{
    lines.push_back(s);
}
static void PushBackString(vector<wstring>& lines, const string& s)
{
    lines.push_back(msra::strfun::utf16(s));
}
2015-09-14 23:18:44 +03:00
// GetLines - get all lines from a file
2016-01-18 11:36:14 +03:00
template < typename STRING >
2016-02-15 07:28:08 +03:00
static void FileGetLines ( File & file , /*out*/ std : : vector < STRING > & lines )
2015-09-14 23:18:44 +03:00
{
2016-02-15 07:28:08 +03:00
lines . clear ( ) ;
string line ;
2015-09-14 23:18:44 +03:00
while ( ! file . IsEOF ( ) )
{
file . GetLine ( line ) ;
2016-02-15 07:28:08 +03:00
PushBackString ( lines , line ) ;
2015-09-14 23:18:44 +03:00
}
}
2016-01-18 11:36:14 +03:00
void File : : GetLines ( std : : vector < std : : wstring > & lines )
{
FileGetLines ( * this , lines ) ;
} ;
void File : : GetLines ( std : : vector < std : : string > & lines )
{
FileGetLines ( * this , lines ) ;
}
2015-09-14 23:18:44 +03:00
2014-08-30 03:21:42 +04:00
// Put a zero/space terminated wstring into a file
// val     - value to write to the file
// returns - this file, to allow chaining
File& File::operator<<(const std::wstring& val)
{
    const wchar_t* text = val.c_str();
    WriteString(text);
    return *this;
}
// Put a zero/space terminated string into a file
// val     - value to write to the file
// returns - this file, to allow chaining
File& File::operator<<(const std::string& val)
{
    const char* text = val.c_str();
    WriteString(text);
    return *this;
}
// Put a marker in the file, the marker depends on the file type
// marker  - marker to place in the file
// returns - this file, to allow chaining
// Only two markers produce output, and only for text files: end-of-file writes ^Z (char 26),
// end-of-list writes "\r\n". Section markers must go through PutMarker() with a string.
File& File::operator<<(FileMarker marker)
{
    const bool textMode = (m_options & fileOptionsText) != 0;
    switch (marker)
    {
    case fileMarkerEndFile: // end of file marker
        // use ^Z for end of file for text files
        // TODO: What??
        if (textMode)
            *this << char(26);
        break;
    case fileMarkerEndList: // end of line/list marker
        if (textMode)
            WriteString("\r\n");
        break;
    case fileMarkerBeginSection: // beginning of section
    case fileMarkerEndSection:   // end of section
        assert(false);           // sections should use a string modifier
        break;
    case fileMarkerBeginFile:     // TODO: why not write a BOM?
    case fileMarkerBeginList:     // no marker written
    case fileMarkerListSeparator: // built-in space delimiter for all types; future: make this customizable (i.e. ',')
        break;
    }
    return *this;
}
// PutMarker for beginning of list support (lists with a count)
// marker - must be fileMarkerBeginList (checked by assert only)
// count  - [in] the number of elements in the list
File& File::PutMarker(FileMarker marker, size_t count)
{
    // only beginning of list supported for count markers
    assert(marker == fileMarkerBeginList);
    (void) marker; // unused when asserts are compiled out
    File& self = *this;
    self << count;
    return self;
}
// PutMarker for section beginning and ending tags
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (checked by assert only)
// section - [in] name of section; written like any other string
File& File::PutMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // unused when asserts are compiled out
    *this << section;
    return *this;
}
// PutMarker for section beginning and ending tags (wide-string version)
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (checked by assert only)
// section - [in] name of section; written like any other string
File& File::PutMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // unused when asserts are compiled out
    *this << section;
    return *this;
}
// Get a wstring from a file
// Text files are read with fgetwtoken(), all others with fgetwstring().
// val - [out] value read from the file
File& File::operator>>(std::wstring& val)
{
    if (!IsTextBased())
    {
        val = fgetwstring(m_file);
        return *this;
    }
    val = fgetwtoken(m_file);
    return *this;
}
// Get a string from a file
// Text files are read with fgettoken(), all others with fgetstring().
// val - [out] value read from the file
File& File::operator>>(std::string& val)
{
    if (!IsTextBased())
    {
        val = fgetstring(m_file);
        return *this;
    }
    val = fgettoken(m_file);
    return *this;
}
// ReadChars - read a specified number of characters, and reset read pointer if requested
// val - [in,out] return value will be returned here
// cnt - number of characters to read
// reset - reset the read pointer
void File : : ReadChars ( std : : string & val , size_t cnt , bool reset )
{
2014-10-13 22:36:46 +04:00
size_t pos = 0 ; // (initialize to keep compiler happy)
2014-08-30 03:21:42 +04:00
if ( reset )
pos = GetPosition ( ) ;
val . resize ( cnt ) ;
2016-01-18 11:36:14 +03:00
char * str = const_cast < char * > ( val . c_str ( ) ) ;
for ( int i = 0 ; i < cnt ; + + i )
2014-08-30 03:21:42 +04:00
* this > > str [ i ] ;
if ( reset )
SetPosition ( pos ) ;
}
// ReadChars - read a specified number of characters, and reset read pointer if requested
// val - [in,out] return value will be returned here
// cnt - number of characters to read
// reset - reset the read pointer
void File : : ReadChars ( std : : wstring & val , size_t cnt , bool reset )
{
2014-10-13 22:36:46 +04:00
size_t pos = 0 ; // (initialize to keep compiler happy)
2014-08-30 03:21:42 +04:00
if ( reset )
pos = GetPosition ( ) ;
val . resize ( cnt ) ;
2016-01-18 11:36:14 +03:00
wchar_t * str = const_cast < wchar_t * > ( val . c_str ( ) ) ;
for ( int i = 0 ; i < cnt ; + + i )
2014-08-30 03:21:42 +04:00
* this > > str [ i ] ;
if ( reset )
SetPosition ( pos ) ;
}
// WriteString - outputs a string into the file
// str - the string to output
// size - size of the string to output, if zero null terminated
void File : : WriteString ( const char * str , int size )
{
2016-02-15 07:28:08 +03:00
if ( size > 0 )
{
fwprintf ( m_file , L " %.*hs " , size , str ) ;
}
else
{
if ( IsTextBased ( ) )
fwprintf ( m_file , L " %hs " , str ) ;
else
fputstring ( m_file , str ) ;
}
2014-08-30 03:21:42 +04:00
}
// ReadString - reads a string into the file
// str - the string buffer to read the string into
2016-02-15 07:28:08 +03:00
// size - size of the string buffer incl. zero terminator (we fail if input is too long)
2014-08-30 03:21:42 +04:00
void File : : ReadString ( char * str , int size )
{
2016-02-15 07:28:08 +03:00
if ( IsTextBased ( ) )
{
fgettoken ( m_file , str , size ) ;
if ( BeginsWithUnicodeBOM ( str ) )
for ( ; str [ 3 ] ; str + + )
str [ 0 ] = str [ 3 ] ; // delete it from start of line
}
else
fgetstring ( m_file , str , size ) ;
2014-08-30 03:21:42 +04:00
}
// WriteString - outputs a string into the file
// if writing to text based file and spaces are embedded, writes quotes around string
2016-02-15 07:28:08 +03:00
// BUGBUG: This should be consistent between char and wchar_t versions
2014-08-30 03:21:42 +04:00
// str - the string to output
// size - size of the string to output, if zero null terminated
void File : : WriteString ( const wchar_t * str , int size )
{
2016-01-18 11:36:14 +03:00
# ifdef EMBEDDED_SPACES
2016-02-15 07:28:08 +03:00
// start of implementation of embedded space support with quoting
// not complete, not sure if we need it
bool spacefound = false ;
wchar_t quote = 0 ;
if ( IsTextBased ( ) )
{
// search for embedded spaces and quotes
wstring searchString = L " \" '~ " ;
const wchar_t * result = NULL ;
while ( result = wcspbrk ( str , searchString . c_str ( ) ) )
{
if ( IsWhiteSpace ( * result ) )
spacefound = true ;
searchString . find ( * result , 0 ) ;
}
}
2014-08-30 03:21:42 +04:00
# endif
2016-02-15 07:28:08 +03:00
if ( size > 0 )
{
fwprintf ( m_file , L " %.*ls " , size , str ) ;
}
else
{
if ( IsTextBased ( ) )
fwprintf ( m_file , L " %ls " , str ) ;
else
fputstring ( m_file , str ) ;
}
2014-08-30 03:21:42 +04:00
}
2016-02-15 07:28:08 +03:00
// ReadString - reads a string from the file
2014-08-30 03:21:42 +04:00
// str - the string buffer to read the string into
// size - size of the string string buffer
void File : : ReadString ( wchar_t * str , int size )
{
2016-02-15 07:28:08 +03:00
if ( IsTextBased ( ) )
fgettoken ( m_file , str , size ) ;
else
fgetstring ( m_file , str , size ) ;
2014-08-30 03:21:42 +04:00
}
// IsUnicodeBOM - is the next characters the Unicode Byte Order Mark?
// skip - skip the BOM mark if found (defaults to false)
// returns - true if on a unicode BOM
bool File : : IsUnicodeBOM ( bool skip )
{
File & file = * this ;
2016-02-15 07:28:08 +03:00
uint64_t pos = GetPosition ( ) ; // Note: This is where we will fail for non-seekable streams.
2014-08-30 03:21:42 +04:00
// if we aren't at the beginning of the file, it can't be the byte order mark
if ( pos ! = 0 )
return false ;
// only exists for UNICODE files
bool found = false ;
2016-02-15 07:28:08 +03:00
if ( m_options & fileOptionsText )
2014-08-30 03:21:42 +04:00
{
2016-02-15 07:28:08 +03:00
char val [ 3 ] = { 0 } ;
for ( size_t i = 0 ; i < _countof ( val ) & & ! file . IsEOF ( ) ; i + + )
val [ i ] = ( char ) getc ( m_file ) ;
found = BeginsWithUnicodeBOM ( val ) ;
2014-08-30 03:21:42 +04:00
}
// restore pointer if no BOM or we aren't skipping it
if ( ! found | | ! skip )
{
SetPosition ( pos ) ;
}
return found ;
}
//Size - return the size of the file
// WARNING: calling this will reset the EOF marker, so do so with care
size_t File : : Size ( )
{
2015-06-30 05:24:18 +03:00
if ( ! CanSeek ( ) )
RuntimeError ( " File: attempted to get Size() on non-seekable stream " ) ;
2014-08-30 03:21:42 +04:00
return filesize ( m_file ) ;
}
// IsEOF - if we have read past the end of the file
// return - true if end of file has been found
bool File : : IsEOF ( )
{
return ! ! feof ( m_file ) ;
}
// IsWhiteSpace - are the next characters whitespace (space, \t, \r, \n, etc.)?
// skip - skip the whitespace if found (defaults to false)
// returns - true if whitespace found
2016-02-15 07:28:08 +03:00
// TODO: This function actually consumes the white-space characters. Document that behavior.
2014-08-30 03:21:42 +04:00
bool File : : IsWhiteSpace ( bool skip )
{
bool spaceFound = false ;
bool spaceCur = false ;
2016-02-15 07:28:08 +03:00
int c ;
do
2014-08-30 03:21:42 +04:00
{
2016-02-15 07:28:08 +03:00
c = fgetc ( m_file ) ;
if ( c = = EOF ) // hit the end
return spaceFound ;
spaceCur = ! ! isspace ( c ) ;
spaceFound = spaceFound | | spaceCur ;
} while ( spaceCur & & skip ) ;
// put back the last character (EOF is ignored)
ungetc ( c , m_file ) ;
2014-08-30 03:21:42 +04:00
return spaceFound ;
}
// EndOfLineOrEOF - are the next characters an end of line sequence ('\r\n') possibly preceeded by (space, \t)? EOF detected too
// skip - skip the end of line if found (defaults to false)
// returns - true if end of line found, EOF if end of file found, or false if nothing found, in which case any leading space will have been stripped
int File : : EndOfLineOrEOF ( bool skip )
{
2016-02-15 07:28:08 +03:00
if ( IsTextBased ( ) )
return fskipNewline ( m_file , skip ) ;
else
return false ;
2014-08-30 03:21:42 +04:00
}
2016-11-07 22:40:00 +03:00
// Buffer write stream
int File : : Setvbuf ( )
{
return setvbuf ( this - > m_file , NULL , _IOFBF , WRITE_BUFFER_SIZE ) ;
}
2014-08-30 03:21:42 +04:00
// Get a marker from the file
// some are ignored others are expecting characters
// must use GetMarker methods for those that require parameters
// marker  - the marker expected at the current position
// returns - this file, to allow chaining
// Fails with RuntimeError() if an expected end-of-file or newline is not found.
File& File::operator>>(FileMarker marker)
{
    switch (marker)
    {
    case fileMarkerBeginFile: // beginning of file marker
        // check for (and skip) Unicode BOM marker
        if (IsTextBased() && CanSeek()) // files from a pipe cannot begin with Unicode BOM, sorry
            IsUnicodeBOM(true);
        break;
    case fileMarkerEndFile: // end of file marker, should we throw if it's not the end of the file?
        if (!IsEOF())
            RuntimeError("fileMarkerEndFile not found");
        break;
    case fileMarkerEndList: // end of line/list marker
        // EOF can also be returned, which does not count as a newline
        if (IsTextBased() && EndOfLineOrEOF(true) != (int) true)
            RuntimeError("Newline not found");
        break;
    case fileMarkerBeginSection: // beginning of section
    case fileMarkerEndSection:   // end of section
        assert(false);           // sections should use a string modifier
        break;
    case fileMarkerBeginList:     // no marker written unless a list with a count header
    case fileMarkerListSeparator: // built-in space delimiter for all types; future: make this customizable (i.e. ',')
        break;
    }
    return *this;
}
// Get a marker from the file
// some are ignored others are expecting characters
// must use GetMarker methods for those that require parameters
2016-02-15 07:28:08 +03:00
// This function will fail for non-seekable streams.
2014-08-30 03:21:42 +04:00
bool File : : IsMarker ( FileMarker marker , bool skip )
{
bool retval = false ;
2016-01-18 11:36:14 +03:00
switch ( marker )
2014-08-30 03:21:42 +04:00
{
case fileMarkerBeginFile : // beginning of file marker
// check for Unicode BOM marker
retval = IsUnicodeBOM ( skip ) ;
break ;
case fileMarkerEndFile : // end of file marker, should we throw if it's not the end of the file?
retval = IsEOF ( ) ;
break ;
case fileMarkerBeginList : // Beginning of list marker
// no marker written unless an list with a count header
// should we try to validate BOL header (just know it's an int, not negative, etc.)
break ;
case fileMarkerListSeparator : // separate elements of a list
// do nothing for now, built in space deliminter for all types (before type)
// future: make this customizable, so you can specify a separator (i.e. ',')
break ;
case fileMarkerEndList : // end of line/list marker
if ( IsTextBased ( ) )
{
int eolSeen = false ;
eolSeen = EndOfLineOrEOF ( skip ) ;
2016-01-18 11:36:14 +03:00
retval = ( eolSeen = = ( int ) true ) ;
2014-08-30 03:21:42 +04:00
}
break ;
case fileMarkerBeginSection : // beginning of section
2016-01-18 11:36:14 +03:00
case fileMarkerEndSection : // end of section
2014-08-30 03:21:42 +04:00
// can't destinquish from a string currently
break ;
}
return retval ;
}
// GetMarker for beginning of list support (lists with a count)
// marker - must be fileMarkerBeginList (checked by assert only)
// count  - [out] returns the number of elements in the list
File& File::GetMarker(FileMarker marker, size_t& count)
{
    // only beginning of list supported for count file markers
    assert(marker == fileMarkerBeginList);
    (void) marker; // unused when asserts are compiled out
    if (!IsTextBased())
        fget(m_file, count);
    else
        ftrygetText(m_file, count); // use text based try, so it can fail without an exception
    return *this;
}
// GetMarker for section beginning and ending tags
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (checked by assert only)
// section - [in] name of section that is expected
// Fails with RuntimeError() if the string read from the file differs from the expected name.
File& File::GetMarker(FileMarker marker, const std::string& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // unused when asserts are compiled out
    string found;
    *this >> found;
    const bool matches = (found == section);
    if (!matches)
        RuntimeError("section name mismatch %s != %s", found.c_str(), section.c_str());
    return *this;
}
// GetMarker for section beginning and ending tags (wide-string version)
// marker  - must be fileMarkerBeginSection or fileMarkerEndSection (checked by assert only)
// section - [in] name of section that is expected
// Fails with RuntimeError() if the string read from the file differs from the expected name.
File& File::GetMarker(FileMarker marker, const std::wstring& section)
{
    // only the section markers take a string parameter
    assert(marker == fileMarkerBeginSection || marker == fileMarkerEndSection);
    (void) marker; // unused when asserts are compiled out
    wstring found;
    *this >> found;
    const bool matches = (found == section);
    if (!matches)
        RuntimeError("section name mismatch %ls != %ls", found.c_str(), section.c_str());
    return *this;
}
// TryGetMarker for section beginning and ending tags
// section - [in]name of section that is expected
bool File : : TryGetMarker ( FileMarker marker , const std : : wstring & section )
{
// only the section markers take a string parameter
2016-01-18 11:36:14 +03:00
assert ( marker = = fileMarkerBeginSection | | marker = = fileMarkerEndSection ) ;
marker ;
2014-08-30 03:21:42 +04:00
size_t pos = GetPosition ( ) ;
std : : wstring str ;
try
{
* this > > str ;
if ( str = = section )
return true ;
}
2016-01-18 11:36:14 +03:00
catch ( . . . )
2014-08-30 03:21:42 +04:00
{
2016-01-23 00:58:47 +03:00
// eat
2014-08-30 03:21:42 +04:00
}
SetPosition ( pos ) ;
return false ;
}
// TryGetMarker for section beginning and ending tags
// section - [in]name of section that is expected
bool File : : TryGetMarker ( FileMarker marker , const std : : string & section )
{
// only the section markers take a string parameter
2016-01-18 11:36:14 +03:00
assert ( marker = = fileMarkerBeginSection | | marker = = fileMarkerEndSection ) ;
marker ;
2014-08-30 03:21:42 +04:00
size_t pos = GetPosition ( ) ;
std : : string str ;
try
{
* this > > str ;
if ( str = = section )
return true ;
}
2016-01-18 11:36:14 +03:00
catch ( . . . )
2014-08-30 03:21:42 +04:00
{
return false ;
}
SetPosition ( pos ) ;
return false ;
}
// GetPosition - Get position in a file
2014-10-24 00:14:19 +04:00
uint64_t File : : GetPosition ( )
2014-08-30 03:21:42 +04:00
{
2015-06-30 05:24:18 +03:00
if ( ! CanSeek ( ) )
RuntimeError ( " File: attempted to GetPosition() on non-seekable stream " ) ;
2014-08-30 03:21:42 +04:00
return fgetpos ( m_file ) ;
}
// Set the position in the file
// pos - position in the file
2014-10-24 00:14:19 +04:00
void File : : SetPosition ( uint64_t pos )
2014-08-30 03:21:42 +04:00
{
2015-06-30 05:24:18 +03:00
if ( ! CanSeek ( ) )
RuntimeError ( " File: attempted to SetPosition() on non-seekable stream " ) ;
fsetpos ( m_file , pos ) ;
2014-08-30 03:21:42 +04:00
}
2016-02-15 07:28:08 +03:00
2016-04-04 04:20:01 +03:00
// helper to load a matrix from a stream (file or string literal)
// The input string is expected to contain one line per matrix row (natural printing order for humans).
// Inputs:
// - getLineFn: a lambda that fills a string with the next input line (=next matrix row)
// The lambda returns an empty string to denote the end.
// Outputs:
// - numRows, numCols: matrix dimensions inferred from newlines
// - array: matrix values in column-major order (ready for SetValue())
template < class ElemType , class F >
static void LoadMatrixFromLambda ( const F & getLineFn , const wstring & locationForMsg , vector < ElemType > & array , size_t & /*out*/ numRows , size_t & /*out*/ numCols )
2016-02-15 07:28:08 +03:00
{
// load matrix into vector of vectors (since we don't know the size in advance)
2016-04-04 04:20:01 +03:00
vector < ElemType > vec ;
2016-02-15 07:28:08 +03:00
std : : vector < std : : vector < ElemType > > elements ;
2016-04-04 04:20:01 +03:00
size_t numColsInFirstRow = 0 ;
2016-02-15 07:28:08 +03:00
std : : string line ;
2016-04-04 04:20:01 +03:00
for ( ; ; )
2016-02-15 07:28:08 +03:00
{
2016-04-04 04:20:01 +03:00
// get next input line
getLineFn ( line ) ;
if ( line . empty ( ) )
2016-02-15 07:28:08 +03:00
break ;
// tokenize and parse
vec . clear ( ) ;
const char * p = line . c_str ( ) ;
for ( ; ; )
{
while ( isspace ( ( unsigned char ) * p ) )
p + + ;
if ( ! * p )
break ;
char * ep ; // will be set to point to first character that failed parsing
double value = strtod ( p , & ep ) ;
if ( * ep ! = 0 & & ! isspace ( ( unsigned char ) * ep ) )
2016-04-04 04:20:01 +03:00
RuntimeError ( " LoadMatrixFromTextFile: Malformed number '%.15s...' in row %d of %ls " , p , ( int ) elements . size ( ) , locationForMsg . c_str ( ) ) ;
2016-02-15 07:28:08 +03:00
p = ep ;
vec . push_back ( ( ElemType ) value ) ;
}
size_t numElementsInRow = vec . size ( ) ;
if ( elements . empty ( ) )
numColsInFirstRow = numElementsInRow ;
else if ( numElementsInRow ! = numColsInFirstRow )
2016-04-04 04:20:01 +03:00
RuntimeError ( " Row %d has column dimension %d, inconsistent with previous dimension %d: %ls " , ( int ) elements . size ( ) , ( int ) numElementsInRow , ( int ) numColsInFirstRow , locationForMsg . c_str ( ) ) ;
2016-02-15 07:28:08 +03:00
elements . push_back ( vec ) ;
}
numRows = elements . size ( ) ;
numCols = numColsInFirstRow ;
// Perform transpose when copying elements from vectors to ElemType[],
// in order to store in column-major format.
2016-04-04 04:20:01 +03:00
array . resize ( numRows * numCols ) ;
2016-02-15 07:28:08 +03:00
for ( int i = 0 ; i < numCols ; i + + )
for ( int j = 0 ; j < numRows ; j + + )
array [ i * numRows + j ] = elements [ j ] [ i ] ;
2016-04-04 04:20:01 +03:00
}
2016-02-15 07:28:08 +03:00
2016-04-04 04:20:01 +03:00
// LoadMatrixFromTextFile - read a matrix from a plain text file
// The file holds one matrix row per line; the elements of a row are separated by white space.
// Empty lines are skipped.
// filePath - [in] path of the text file; also passed on as the location for error messages
// numRows, numCols - [out] matrix dimensions, as determined by LoadMatrixFromLambda()
// returns: the matrix values, as filled in by LoadMatrixFromLambda()
template <class ElemType>
/*static*/ vector<ElemType> File::LoadMatrixFromTextFile(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols)
{
    File textFile(filePath, FileOptions::fileOptionsText | FileOptions::fileOptionsRead);

    // line source for LoadMatrixFromLambda():
    // delivers the next non-empty line, or leaves 'line' empty once the end of the file is reached
    auto getLineFn = [&textFile](string& line)
    {
        for (;;)
        {
            if (textFile.IsEOF())
            {
                line.clear(); // empty line indicates end of file
                return;
            }
            textFile.GetLine(line);
            // End of file manifests as an empty line at the end.
            // Also, empty lines within the file are allowed, as that may help to visually structure matrices that really are >2D tensors.
            if (!line.empty())
                return; // got the next line to return
        }
    };

    vector<ElemType> values;
    LoadMatrixFromLambda(getLineFn, filePath, values, numRows, numCols);
    return values;
}
// LoadMatrixFromStringLiteral - parse a matrix from a string
// The string holds one matrix row per line; lines are separated by CR and/or LF characters, and the
// last line does not need a line end. Blanks in front of a row and empty lines are skipped.
// literal - [in] the text to parse
// numRows, numCols - [out] matrix dimensions, as determined by LoadMatrixFromLambda()
// returns: the matrix values, as filled in by LoadMatrixFromLambda()
template <class ElemType>
/*static*/ vector<ElemType> File::LoadMatrixFromStringLiteral(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols)
{
    // LoadMatrixFromLambda() reads its input lines from the following lambda,
    // which delivers the next input line, or an empty string when the end is reached
    size_t pos = 0; // cursor for traversing the string. The lambda takes this by reference and modifies it.
    auto getLineFn = [&](string& line)
    {
        // find first non-blank character of line
        // (the character set is: space, CR, LF)
        pos = literal.find_first_not_of(" \r \n ", pos); // skip previous line end and any leading spaces
        if (pos == string::npos)
        {
            line.clear(); // hit the end: return empty line
            return;
        }
        // find end of line
        // Only CR and LF end a line. A space must not be part of this set, since spaces separate
        // the values within a row; otherwise each row would be cut off after its first value.
        auto endPos = literal.find_first_of("\r\n", pos + 1);
        if (endPos == string::npos)
            endPos = literal.size(); // no LF required at very end, so that it looks pretty in BS source code
        line = literal.substr(pos, endPos - pos);
        pos = endPos; // and advance cursor (we position it on the line end, which is skipped in next round)
    };
    vector<ElemType> array;
    LoadMatrixFromLambda(getLineFn, L" string literal ", array, numRows, numCols);
    return array;
}
// Explicit instantiations of LoadMatrixFromTextFile() and LoadMatrixFromStringLiteral() for the
// element types float and double.
// NOTE(review): presumably required because the template definitions live in this source file rather than in a header -- confirm.
template vector < float > File : : LoadMatrixFromTextFile < float > ( const std : : wstring & filePath , size_t & /*out*/ numRows , size_t & /*out*/ numCols ) ;
template vector < double > File : : LoadMatrixFromTextFile < double > ( const std : : wstring & filePath , size_t & /*out*/ numRows , size_t & /*out*/ numCols ) ;
2016-04-04 04:20:01 +03:00
template vector < float > File : : LoadMatrixFromStringLiteral < float > ( const std : : string & literal , size_t & /*out*/ numRows , size_t & /*out*/ numCols ) ;
template vector < double > File : : LoadMatrixFromStringLiteral < double > ( const std : : string & literal , size_t & /*out*/ numRows , size_t & /*out*/ numCols ) ;
2016-02-15 07:28:08 +03:00
} } }