// (removed scraper artifacts: duplicated line-count/size/language metadata)
// ConfigParser.cpp -- config parser (syntactic only, that is, source -> Expression tree)
|
|
|
|
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
|
|
|
#include "BrainScriptParser.h"
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cctype>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <deque>
|
|
#include <set>
|
|
#include <stdexcept>
|
|
#include <algorithm>
|
|
|
|
#ifndef let
|
|
#define let const auto
|
|
#endif
|
|
|
|
namespace Microsoft { namespace MSR { namespace BS {
|
|
|
|
using namespace std;
|
|
using namespace msra::strfun;
|
|
using namespace Microsoft::MSR::CNTK;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// source files and text references (location) into them
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// SourceFile constructors
|
|
// Construct a SourceFile directly from in-memory text (e.g. a command-line override).
// 'location' is a display name used in error messages; the text is split into lines at CR and LF.
SourceFile::SourceFile(wstring location, wstring text) : path(location), lines(split(text, L"\r\n")) { } // from string, e.g. command line
|
// Construct a SourceFile by reading the file at 'path' from disk into 'lines'.
// NOTE(review): 'includePaths' is accepted but not used here; the bare 'includePaths;'
// statement only silences the unused-parameter warning. Presumably include-path search
// is meant to happen here eventually ("... scan paths") — TODO confirm with callers.
SourceFile::SourceFile(wstring path, const vector<wstring> & includePaths) : path(path) // from file
{
    // ... scan paths
    includePaths; // (currently unused; suppresses compiler warning)
    File(path, fileOptionsRead).GetLines(lines); // read the entire file as a vector of lines
}
|
|
|
|
// A location is valid once it has been bound to a registered source file;
// default-constructed locations carry SIZE_MAX as their file index.
bool TextLocation::IsValid() const { return sourceFileAsIndex != SIZE_MAX; }
|
|
|
// register a new source file and return a TextPosition that points to its start
|
|
/*static*/ TextLocation TextLocation::NewSourceFile(SourceFile && sourceFile)
|
|
{
|
|
TextLocation loc;
|
|
loc.lineNo = 0;
|
|
loc.charPos = 0;
|
|
loc.sourceFileAsIndex = sourceFileMap.size(); // index under which we store the source file
|
|
sourceFileMap.push_back(move(sourceFile)); // take ownership of the source file and give it a numeric index
|
|
return loc;
|
|
}
|
|
|
|
// helper for pretty-printing errors: shows a source-code line with a marker line
// underneath (e.g. "   ^") that points at the character position(s) of the error
struct Issue
{
    TextLocation location; // using lineno and source file; char position only for printing the overall error loc
    wstring markup; // marker line: one symbol per flagged char position, space-padded in between
    // place 'symbol' at column 'charPos' of the marker line, growing it with spaces as needed;
    // an already-placed symbol at that position is kept (first writer wins)
    void AddMarkup(wchar_t symbol, size_t charPos)
    {
        if (charPos >= markup.size())
            markup.resize(charPos+1, L' '); // pad with spaces up to the desired position if the string is not that long yet
        if (markup[charPos] == L' ') // don't overwrite
            markup[charPos] = symbol;
    }
    Issue(TextLocation location) : location(location) { }
};
|
|
|
|
// trace
|
|
/*static*/ void TextLocation::Trace(TextLocation location, const wchar_t * traceKind, const wchar_t * op, const wchar_t * exprPath)
|
|
{
|
|
fprintf(stderr, "%ls: %ls (path %ls)\n", traceKind, op, exprPath);
|
|
const auto & lines = location.GetSourceFile().lines;
|
|
const auto line = (location.lineNo == lines.size()) ? L"(end)" : lines[location.lineNo].c_str();
|
|
Issue issue(location);
|
|
issue.AddMarkup(L'^', location.charPos);
|
|
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
|
|
}
|
|
|
|
// report an error
|
|
// The source line is shown, and the position is marked as '^'.
|
|
// Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
|
|
// Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
|
|
/*static*/ void TextLocation::PrintIssue(const vector<TextLocation> & locations, const wchar_t * errorKind, const wchar_t * kind, const wchar_t * what)
|
|
{
|
|
vector<Issue> issues; // tracing the error backwards
|
|
size_t symbolIndex = 0;
|
|
for (size_t n = 0; n < locations.size(); n++)
|
|
{
|
|
let & location = locations[n];
|
|
if (!location.IsValid()) // means thrower has no location, go up one context
|
|
continue;
|
|
// build the array
|
|
if (symbolIndex == 0 || location.lineNo != issues.back().location.lineNo || location.sourceFileAsIndex != issues.back().location.sourceFileAsIndex)
|
|
{
|
|
if (issues.size() == 10)
|
|
break;
|
|
else
|
|
issues.push_back(location);
|
|
}
|
|
// get the symbol to indicate how many steps back, in this sequence: ^ 0..9 a..z A..Z (we don't go further than this)
|
|
wchar_t symbol;
|
|
if (symbolIndex == 0) symbol = '^';
|
|
else if (symbolIndex < 1 + 10) symbol = '0' + (wchar_t)symbolIndex - 1;
|
|
else if (symbolIndex < 1 + 10 + 26) symbol = 'a' + (wchar_t)symbolIndex - (1 + 10);
|
|
else if (symbolIndex < 1 + 10 + 26 + 26) symbol = 'A' + (wchar_t)symbolIndex - (1 + 10 + 26);
|
|
else break;
|
|
symbolIndex++;
|
|
// insert the markup
|
|
issues.back().AddMarkup(symbol, location.charPos);
|
|
}
|
|
// print it backwards
|
|
if (!locations.empty()) // (be resilient to some throwers not having a TextrLocation; to be avoided)
|
|
{
|
|
let & firstLoc = issues.front().location;
|
|
fprintf(stderr, "\n%ls while %ls line %d char %d of %ls\n", errorKind, kind, (int)firstLoc.lineNo + 1/*report 1-based*/, (int)firstLoc.charPos + 1, firstLoc.GetSourceFile().path.c_str());
|
|
fprintf(stderr, "see location marked ^ and parent contexts marked 0..9, a..z, A..Z:\n\n");
|
|
for (auto i = issues.rbegin(); i != issues.rend(); i++)
|
|
{
|
|
let & issue = *i;
|
|
auto & where = issue.location;
|
|
const auto & lines = where.GetSourceFile().lines;
|
|
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
|
|
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
|
|
}
|
|
}
|
|
fprintf(stderr, "%ls: %ls\n", errorKind, what);
|
|
fflush(stderr);
|
|
}
|
|
// the global registry of all loaded source files; TextLocation::sourceFileAsIndex indexes into it
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;
|
|
|
// ---------------------------------------------------------------------------
|
|
// reader -- reads source code, including loading from disk
|
|
// ---------------------------------------------------------------------------
|
|
|
|
class CodeSource
|
|
{
|
|
vector<TextLocation> locationStack; // parent locations in case of included files
|
|
TextLocation cursor; // current location
|
|
const wchar_t * currentLine; // cache of cursor.GetSourceFile().lines[cursor.lineNo]
|
|
// update currentLine from cursor
|
|
void CacheCurrentLine()
|
|
{
|
|
let & lines = cursor.GetSourceFile().lines;
|
|
if (cursor.lineNo == lines.size())
|
|
currentLine = nullptr;
|
|
else
|
|
currentLine = lines[cursor.lineNo].c_str();
|
|
}
|
|
protected:
|
|
// set a source file; only do that from constructor or inside PushSourceFile()
|
|
void SetSourceFile(SourceFile && sourceFile)
|
|
{
|
|
cursor = TextLocation::NewSourceFile(move(sourceFile)); // save source file and set the cursor to its start
|
|
CacheCurrentLine(); // re-cache current line
|
|
}
|
|
public:
|
|
class CodeSourceException : public ConfigException
|
|
{
|
|
public:
|
|
CodeSourceException(const wstring & msg, TextLocation where) : ConfigException(msg, where) { }
|
|
/*ConfigException::*/ const wchar_t * kind() const { return L"reading source"; }
|
|
};
|
|
|
|
__declspec_noreturn static void Fail(wstring msg, TextLocation where)
|
|
{
|
|
Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
|
|
throw CodeSourceException(msg, where);
|
|
}
|
|
|
|
// enter a source file, at start or as a result of an include statement
|
|
void PushSourceFile(SourceFile && sourceFile)
|
|
{
|
|
locationStack.push_back(cursor);
|
|
SetSourceFile(move(sourceFile));
|
|
}
|
|
|
|
// are we inside an include file?
|
|
bool IsInInclude() { return locationStack.size() > 0; }
|
|
|
|
// done with a source file. Only call this for nested files; the outermost one must not be popped.
|
|
void PopSourceFile()
|
|
{
|
|
if (!IsInInclude())
|
|
LogicError("PopSourceFile: location stack empty");
|
|
cursor = locationStack.back(); // restore cursor we came from
|
|
CacheCurrentLine(); // re-cache current line
|
|
locationStack.pop_back();
|
|
}
|
|
|
|
// get current cursor; this is remembered for each token, and also used when throwing errors
|
|
TextLocation GetCursor() const { return cursor; }
|
|
|
|
// get character at current position.
|
|
// Special cases:
|
|
// - end of line is returned as '\n'
|
|
// - end of file is returned as 0
|
|
wchar_t GotChar() const
|
|
{
|
|
if (!currentLine) return 0; // end of file
|
|
else if (!currentLine[cursor.charPos]) return '\n'; // end of line
|
|
else return currentLine[cursor.charPos];
|
|
}
|
|
|
|
// we chan also return the address of the current character, e.g. for passing it to a C stdlib funcion such as wcstod()
|
|
const wchar_t * GotCharPtr() const { return currentLine + cursor.charPos; }
|
|
|
|
// advance cursor by #chars (but across line boundaries)
|
|
void ConsumeChars(size_t chars)
|
|
{
|
|
let ch = GotChar();
|
|
if (!ch) LogicError("Consume: cannot run beyond end of source file");
|
|
if (ch == '\n' && chars > 0)
|
|
{
|
|
if (chars != 1) LogicError("Consume: cannot run beyond end of line");
|
|
cursor.lineNo++;
|
|
CacheCurrentLine(); // line no has changed: re-cache the line ptr
|
|
cursor.charPos = 0;
|
|
}
|
|
else
|
|
cursor.charPos += chars;
|
|
}
|
|
|
|
// get the next character
|
|
wchar_t GetChar()
|
|
{
|
|
ConsumeChars(1);
|
|
return GotChar();
|
|
}
|
|
};
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// lexer -- iterates over the source code and returns token by token
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Lexer: turns the character stream of CodeSource into tokens.
// Whitespace and comments are skipped; 'include' directives are resolved here so that
// the parser above never sees them. Exposes one token of lookahead via GotToken().
class Lexer : public CodeSource
{
    set<wstring> keywords;        // reserved words, returned with kind == keyword instead of identifier
    set<wstring> punctuations;    // all recognized one- and two-character operator/punctuation strings
    vector<wstring> includePaths; // search paths handed to SourceFile() when an include is encountered
public:
    Lexer(vector<wstring> && includePaths) : CodeSource(), includePaths(includePaths), currentToken(TextLocation())
    {
        keywords = set<wstring>
        {
            L"include",
            L"new", L"with", L"true", L"false",
            L"if", L"then", L"else",
            L"array",
        };
        punctuations = set<wstring>
        {
            L"=", L";", L",", L"\n",
            L"[", L"]", L"(", L")",
            L"+", L"-", L"*", L"/", L"**", L".*", L"%", L"||", L"&&", L"^",
            L"!",
            L"==", L"!=", L"<", L"<=", L">", L">=",
            L":", L"=>",
            L"..", L".",
            L"//", L"#", L"/*"
        };
    }

    enum TokenKind
    {
        invalid, punctuation, numberliteral, stringliteral, booleanliter, identifier, keyword, eof // TODO: what are true and false? Literals or identifiers?
    };
    // NOTE(review): 'booleanliter' is never produced by NextToken() ('true'/'false' come back as
    // kind == keyword), and TokenKindToString() has no case for it (would print "(unknown?)").

    struct Token
    {
        wstring symbol;             // identifier, keyword, punctuation, or string literal
        double number;              // value, when kind == numberliteral
        TokenKind kind;             // classification of this token
        TextLocation beginLocation; // text loc of first character of this token
        bool isLineInitial;         // this token is the first on the line (ignoring comments)
        Token(TextLocation loc) : beginLocation(loc), kind(invalid), number(0.0), isLineInitial(false) { }
        // diagnostic helper: name of a TokenKind for error/debug messages
        static wstring TokenKindToString(TokenKind kind)
        {
            switch (kind)
            {
            case invalid: return L"invalid";
            case punctuation: return L"punctuation";
            case numberliteral: return L"numberliteral";
            case stringliteral: return L"stringliteral";
            case identifier: return L"identifier";
            case keyword: return L"keyword";
            case eof: return L"eof";
            default: return L"(unknown?)";
            }
        }
        wstring ToString() const // string to show the content of token for debugging
        {
            let kindStr = TokenKindToString(kind);
            switch (kind)
            {
            case numberliteral: return kindStr + wstrprintf(L" %f", number);
            case stringliteral: return kindStr + L" '" + symbol + L"'";
            case identifier: case keyword: case punctuation: return kindStr + L" " + symbol;
            default: return kindStr;
            }
        }
    };

    class LexerException : public ConfigException
    {
    public:
        LexerException(const wstring & msg, TextLocation where) : ConfigException(msg, where) { }
        /*ConfigException::*/ const wchar_t * kind() const { return L"tokenizing"; }
    };

private:
    // report a tokenizing failure at the token's start position; never returns
    __declspec_noreturn static void Fail(wstring msg, Token where)
    {
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
        throw LexerException(msg, where.beginLocation);
    }

    Token currentToken; // the one-token lookahead exposed through GotToken()
    // consume input characters to form a next token
    //  - this function mutates the cursor, but does not set currentToken
    //  - white space and comments are skipped
    //  - including files is handled here
    //  - the cursor is left on the first character that does not belong to the token
    // TODO: need to know whether we want to see '\n' or not
    Token NextToken()
    {
        auto ch = GotChar();
        // skip white space
        // We remember whether we crossed a line end. Dictionary assignments end at newlines if syntactically acceptable.
        bool crossedLineEnd = (GetCursor().lineNo == 0 && GetCursor().charPos == 0); // the very first token also counts as line-initial
        while (iswblank(ch) || ch == '\n' || ch == '\r')
        {
            crossedLineEnd |= (ch == '\n' || ch == '\r');
            ch = GetChar();
        }
        Token t(GetCursor());
        t.isLineInitial = crossedLineEnd;
        // handle end of (include) file
        if (ch == 0)
        {
            if (IsInInclude())
            {
                // end of an include file: resume in the parent file and re-lex from there
                PopSourceFile();
                t = NextToken(); // tail call--the current 't' gets dropped/ignored
                t.isLineInitial = true; // eof is a line end
                return t;
            }
            // really end of all source code: we are done. If calling this function multiple times, we will keep returning this.
            t.kind = eof;
        }
        else if (iswdigit(ch) || (ch == L'.' && iswdigit(GotCharPtr()[1]))) // --- number
        {
            let beginPtr = GotCharPtr();
            wchar_t * endPtr = nullptr;
            t.number = wcstod(beginPtr, &endPtr); // BUGBUG: this seems to honor locale settings. We need one that doesn't. With this, CNTK won't parse right in Germany.
            if (endPtr == beginPtr) Fail(L"parsing number", t); // should not really happen!
            t.kind = numberliteral;
            if (endPtr[0] == L'.' && endPtr[-1] == L'.') // prevent 1..2 from begin tokenized 1. .2
                endPtr--; // give the trailing '.' back so it can form the '..' operator
            ConsumeChars(endPtr - beginPtr);
        }
        else if (iswalpha(ch) || ch == L'_') // --- identifier or keyword
        {
            while (iswalpha(ch) || ch == L'_' || iswdigit(ch)) // inside we also allow digits
            {
                t.symbol.push_back(ch);
                ch = GetChar();
            }
            // check against keyword list
            if (keywords.find(t.symbol) != keywords.end()) t.kind = keyword;
            else t.kind = identifier;
            // special case: include "path" -- resolved right here, invisibly to the parser
            if (t.symbol == L"include")
            {
                let nameTok = NextToken(); // must be followed by a string literal
                if (nameTok.kind != stringliteral) Fail(L"'include' must be followed by a quoted string", nameTok);
                let path = nameTok.symbol; // TODO: some massaging of the path
                PushSourceFile(SourceFile(path, includePaths)); // current cursor is right after the pathname; that's where we will pick up later
                return NextToken();
            }
        }
        else if (ch == L'"' || ch == 0x27) // --- string literal (0x27 is the single-quote character)
        {
            t.kind = stringliteral;
            let q = ch; // remember quote character
            ch = GetChar(); // consume the quote character
            while (ch != 0 && ch != q) // note: our strings do not have any escape characters to consider
            {
                t.symbol.append(1, ch);
                ch = GetChar();
            }
            if (ch == 0) // runaway string
                Fail(L"string without closing quotation mark", t);
            GetChar(); // consume the closing quote
        }
        else // --- punctuation
        {
            t.kind = punctuation;
            t.symbol = ch;
            t.symbol.append(1, GetChar()); // first try two-char punctuation
            if (punctuations.find(t.symbol) != punctuations.end())
                GetChar(); // it is a two-char one: need to consume the second one of them
            else // try single-char one
            {
                t.symbol.pop_back(); // drop the last one & try again
                if (punctuations.find(t.symbol) == punctuations.end()) // unknown
                    Fail(L"unexpected character: " + t.symbol, t);
            }
            // special case: comments
            if (t.symbol == L"#" || t.symbol == L"//")
            {
                ConsumeChars(wcslen(GotCharPtr())); // skip the rest of the line
                return NextToken();
            }
            else if (t.symbol == L"/*")
            {
                ch = GotChar();
                while (ch != 0 && !(ch == L'*' && GetChar() == L'/')) // note: this test leverages short-circuit evaluation semantics of C
                    ch = GetChar();
                if (ch == 0)
                    Fail(L"comment without closing */", t);
                GetChar(); // consume the final '/'
                return NextToken(); // and return the next token
            }
        }
        return t;
    }
public:
    // peek at the current token without consuming it
    const Token & GotToken() { return currentToken; }
    // advance: lex the next token into the lookahead slot
    void ConsumeToken() { currentToken = NextToken(); }
    // advance and return the new current token
    const Token & GetToken()
    {
        ConsumeToken();
        return GotToken();
    }

    // some simple test function
    void Test()
    {
        let lexerTest = L"new CNTK [ do = (train:eval) # main\ntrain=/*test * */if eval include 'c:/me/test.txt' then 13 else array[1..10](i=>i*i); eval=\"a\"+'b' // line-end\n ] 'a\nb\nc' new";
        PushSourceFile(SourceFile(L"(command line)", lexerTest));
        while (GotToken().kind != Lexer::TokenKind::eof)
        {
            let & token = GotToken(); // get first token
            fprintf(stderr, "%ls\n", token.ToString().c_str());
            ConsumeToken();
        }
        Fail(L"error test", GetCursor()); // note: TextLocation converts implicitly to Token here
    }
};
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// parser -- parses configurations
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// diagnostics helper: print the content
|
|
void Expression::Dump(int indent) const
|
|
{
|
|
fprintf(stderr, "%*s", indent, "");
|
|
if (op == L"s") fprintf(stderr, "'%ls' ", s.c_str());
|
|
else if (op == L"d") fprintf(stderr, "%.f ", d);
|
|
else if (op == L"b") fprintf(stderr, "%s ", b ? "true" : "false");
|
|
else if (op == L"id") fprintf(stderr, "%ls ", id.c_str());
|
|
else if (op == L"new" || op == L"array" || op == L".") fprintf(stderr, "%ls %ls ", op.c_str(), id.c_str());
|
|
else fprintf(stderr, "%ls ", op.c_str());
|
|
if (!args.empty())
|
|
{
|
|
fprintf(stderr, "\n");
|
|
for (const auto & arg : args)
|
|
arg->Dump(indent + 2);
|
|
}
|
|
if (!namedArgs.empty())
|
|
{
|
|
fprintf(stderr, "\n");
|
|
for (const auto & arg : namedArgs)
|
|
{
|
|
fprintf(stderr, "%*s%ls =\n", indent + 2, "", arg.first.c_str());
|
|
arg.second.second->Dump(indent + 4);
|
|
}
|
|
}
|
|
fprintf(stderr, "\n");
|
|
}
|
|
|
|
// Parser: recursive-descent parser with precedence climbing, building an Expression tree
// from the token stream supplied by Lexer. Syntactic analysis only -- no evaluation.
class Parser : public Lexer
{
    // errors
    class ParseException : public ConfigException
    {
    public:
        ParseException(const wstring & msg, TextLocation where) : ConfigException(msg, where) { }
        /*ConfigException::*/ const wchar_t * kind() const { return L"parsing"; }
    };

    // report a parse failure at the token's start position; never returns
    __declspec_noreturn static void Fail(const wstring & msg, Token where)
    {
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
        throw ParseException(msg, where.beginLocation);
    }

    //void Expected(const wstring & what) { Fail(strprintf("%ls expected", what.c_str()), GotToken().beginLocation); } // I don't know why this does not work
    // fail with "<what> expected" at the current token
    void Expected(const wstring & what) { Fail(what + L" expected", GotToken().beginLocation); }

    // this token must be punctuation 's'; check and get the next
    void ConsumePunctuation(const wchar_t * s)
    {
        let & tok = GotToken();
        if (tok.kind != punctuation || tok.symbol != s)
            Expected(L"'" + wstring(s) + L"'");
        ConsumeToken();
    }

    // this token must be keyword 's'; check and get the next
    void ConsumeKeyword(const wchar_t * s)
    {
        let & tok = GotToken();
        if (tok.kind != keyword || tok.symbol != s)
            Expected(L"'" + wstring(s) + L"'");
        ConsumeToken();
    }

    // this token must be an identifier; check and get the next token. Return the identifier.
    wstring ConsumeIdentifier()
    {
        let & tok = GotToken();
        if (tok.kind != identifier)
            Expected(L"identifier");
        let id = tok.symbol;
        ConsumeToken();
        return id;
    }

    map<wstring, int> infixPrecedence; // precedence level of infix operators (higher binds tighter)
public:
    Parser(SourceFile && sourceFile, vector<wstring> && includePaths) : Lexer(move(includePaths))
    {
        infixPrecedence = map<wstring, int>
        {
            { L".", 100 }, { L"[", 100 }, { L"(", 100 }, // also sort-of infix operands...
            { L"*", 10 }, { L"/", 10 }, { L".*", 10 }, { L"**", 10 }, { L"%", 10 },
            { L"+", 9 }, { L"-", 9 },
            { L"with", 9 },
            { L"==", 8 }, { L"!=", 8 }, { L"<", 8 }, { L"<=", 8 }, { L">", 8 }, { L">=", 8 },
            { L"&&", 7 },
            { L"||", 6 },
            { L":", 5 },
            { L"=>", 0 },
        };
        SetSourceFile(move(sourceFile));
        ConsumeToken(); // get the very first token
    }
    // helper to make an Operand expression with op==tok.symbol and then consume the token
    ExpressionPtr OperandFromTokenSymbol(const Token & tok)
    {
        auto operand = make_shared<Expression>(tok.beginLocation, tok.symbol);
        ConsumeToken();
        return operand;
    }
    // parse one operand: literal, identifier, unary op, 'new', 'if', parens, [record], or array constructor
    ExpressionPtr ParseOperand(bool stopAtNewline)
    {
        let & tok = GotToken();
        ExpressionPtr operand;
        if (tok.kind == numberliteral) // === numeral literal
        {
            operand = make_shared<Expression>(tok.beginLocation, L"d", tok.number, wstring(), false);
            ConsumeToken();
        }
        else if (tok.kind == stringliteral) // === string literal
        {
            operand = make_shared<Expression>(tok.beginLocation, L"s", 0.0, tok.symbol, false);
            ConsumeToken();
        }
        else if (tok.symbol == L"true" || tok.symbol == L"false") // === boolean literal
        {
            operand = make_shared<Expression>(tok.beginLocation, L"b", 0.0, wstring(), (tok.symbol == L"true"));
            ConsumeToken();
        }
        else if (tok.kind == identifier) // === dict member (unqualified)
        {
            operand = make_shared<Expression>(tok.beginLocation, L"id");
            operand->id = ConsumeIdentifier();
        }
        else if (tok.symbol == L"+" || tok.symbol == L"-" // === unary operators
                 || tok.symbol == L"!")
        {
            operand = make_shared<Expression>(tok.beginLocation, tok.symbol + L"("); // encoded as +( -( !(
            ConsumeToken();
            operand->args.push_back(ParseExpression(100, stopAtNewline)); // precedence 100: bind only the immediately following operand
        }
        else if (tok.symbol == L"new") // === new class instance
        {
            operand = OperandFromTokenSymbol(tok);
            operand->id = ConsumeIdentifier(); // class name
            operand->args.push_back(ParseOperand(stopAtNewline)); // constructor argument (typically a record)
        }
        else if (tok.symbol == L"if") // === conditional expression
        {
            operand = OperandFromTokenSymbol(tok);
            operand->args.push_back(ParseExpression(0, false)); // [0] condition
            ConsumeKeyword(L"then");
            operand->args.push_back(ParseExpression(0, false)); // [1] then expression
            ConsumeKeyword(L"else");
            operand->args.push_back(ParseExpression(0, false)); // [2] else expression
        }
        else if (tok.symbol == L"(") // === nested parentheses
        {
            ConsumeToken();
            operand = ParseExpression(0, false/*go across newlines*/); // just return the content of the parens (they do not become part of the expression tree)
            ConsumePunctuation(L")");
        }
        else if (tok.symbol == L"[") // === dictionary constructor
        {
            operand = make_shared<Expression>(tok.beginLocation, L"[]");
            ConsumeToken();
            operand->namedArgs = ParseRecordMembers();
            ConsumePunctuation(L"]");
        }
        else if (tok.symbol == L"array") // === array constructor
        {
            operand = OperandFromTokenSymbol(tok);
            ConsumePunctuation(L"[");
            operand->args.push_back(ParseExpression(0, false)); // [0] first index
            ConsumePunctuation(L"..");
            operand->args.push_back(ParseExpression(0, false)); // [1] last index
            ConsumePunctuation(L"]");
            ConsumePunctuation(L"(");
            operand->args.push_back(ParseExpression(0, false)); // [2] one-argument lambda to initialize
            ConsumePunctuation(L")");
        }
        else
            Expected(L"operand");
        return operand; // not using returns above to avoid "not all control paths return a value"
    }
    // parse an expression: an operand followed by any number of infix operators of at
    // least 'requiredPrecedence' (precedence climbing; higher levels recurse on the right)
    ExpressionPtr ParseExpression(int requiredPrecedence, bool stopAtNewline)
    {
        auto left = ParseOperand(stopAtNewline); // get first operand
        for (;;)
        {
            let & opTok = GotToken();
            // BUGBUG: 'stopAtNewline' is broken.
            // It does not prevent "a = 13 b = 42" from being accepted.
            // On the other hand, it would prevent the totally valid "dict \n with dict2".
            // A correct solution should require "a = 13 ; b = 42", i.e. a semicolon or newline,
            // while continuing to parse across newlines when syntactically meaningful (there is no ambiguity in BrainScript).
            //if (stopAtNewline && opTok.isLineInitial)
            //    break;
            let opIter = infixPrecedence.find(opTok.symbol);
            if (opIter == infixPrecedence.end()) // not an infix operator: we are done here, 'left' is our expression
                break;
            let opPrecedence = opIter->second;
            if (opPrecedence < requiredPrecedence) // operator below required precedence level: does not belong to this sub-expression
                break;
            let op = opTok.symbol;
            auto operation = make_shared<Expression>(opTok.beginLocation, op, left); // [0] is left operand; we will add [1] except for macro application
            // deal with special cases first
            // We treat member lookup (.), macro application (a()), and indexing (a[i]) together with the true infix operators.
            if (op == L".") // === reference of a dictionary item
            {
                ConsumeToken();
                operation->location = GotToken().beginLocation; // location of the identifier after the .
                operation->id = ConsumeIdentifier();
            }
            else if (op == L"=>") // === lambda definition
            {
                if (left->op != L"id") // currently only allow for a single argument
                    Expected(L"identifier");
                ConsumeToken();
                let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a '()' macro-args expression
                // TODO: test parsing of i => j => i*j
                let body = ParseExpression(opPrecedence, stopAtNewline); // pass same precedence; this makes '=>' right-associative e.g.i=>j=>i*j
                operation->args[0] = macroArgs; // [0]: parameter list
                operation->args.push_back(body); // [1]: right operand
            }
            else if (op == L"(") // === macro application
            {
                // op = "(" means 'apply'
                // args[0] = lambda expression (lambda: op="=>", args[0] = param list, args[1] = expression with unbound vars)
                // args[1] = arguments (arguments: op="(), args=vector of expressions, one per arg; and namedArgs)
                operation->args.push_back(ParseMacroArgs(false)); // [1]: all arguments
            }
            else if (op == L"[") // === array index
            {
                ConsumeToken();
                operation->args.push_back(ParseExpression(0, false)); // [1]: index
                ConsumePunctuation(L"]");
            }
            else if (op == L":")
            {
                // special case: (a : b : c) gets flattened into :(a,b,c) i.e. an operation with possibly >2 operands
                ConsumeToken();
                let right = ParseExpression(opPrecedence + 1, stopAtNewline); // get right operand, or entire multi-operand expression with higher precedence
                if (left->op == L":") // appending to a list: flatten it
                {
                    operation->args = left->args;
                    operation->location = left->location; // location of first ':' (we need to choose some location)
                }
                operation->args.push_back(right); // form a list of multiple operands (not just two)
            }
            else // === regular infix operator
            {
                ConsumeToken();
                let right = ParseExpression(opPrecedence + 1, stopAtNewline); // get right operand, or entire multi-operand expression with higher precedence
                operation->args.push_back(right); // [1]: right operand
            }
            left = operation; // the operation becomes the new left operand for any further infix operators
        }
        return left;
    }
    // a macro-args expression lists position-dependent and optional parameters
    // This is used both for defining macros (LHS) and using macros (RHS).
    // Result:
    //  op = "()"
    //  args = vector of arguments (which are given comma-separated)
    //         In case of macro definition, all arguments must be of type "id". Pass 'defining' to check for that.
    //  namedArgs = dictionary of optional args
    //         In case of macro definition, dictionary values are default values that are used if the argument is not given
    ExpressionPtr ParseMacroArgs(bool defining)
    {
        ConsumePunctuation(L"(");
        auto macroArgs = make_shared<Expression>(GotToken().beginLocation, L"()");
        if (GotToken().symbol != L")") // x() defines an empty argument list
        {
            for (;;)
            {
                let expr = ParseExpression(0, false); // this could be an optional arg (var = val)
                if (defining && expr->op != L"id") // when defining we only allow a single identifier
                    Fail(L"argument identifier expected", expr->location);
                if (expr->op == L"id" && GotToken().symbol == L"=")
                {
                    let id = expr->id; // 'expr' gets resolved (to 'id') and forgotten
                    ConsumeToken();
                    let defValueExpr = ParseExpression(0, false); // default value
                    let res = macroArgs->namedArgs.insert(make_pair(id, make_pair(expr->location, defValueExpr)));
                    if (!res.second)
                        Fail(L"duplicate optional parameter '" + id + L"'", expr->location);
                }
                else
                    macroArgs->args.push_back(expr); // [0..]: position args
                if (GotToken().symbol != L",")
                    break;
                ConsumeToken(); // consume the ',' and continue with the next argument
            }
        }
        ConsumePunctuation(L")");
        return macroArgs;
    }
    // parse a sequence of record members (name = expression), optionally ';'-separated
    map<wstring, pair<TextLocation,ExpressionPtr>> ParseRecordMembers()
    {
        // A dictionary is a map
        //  member identifier -> expression
        // Macro declarations are translated into lambdas, e.g.
        //  F(A,B) = expr(A,B)
        // gets represented in the dictionary as
        //  F = (A,B) => expr(A,B)
        // where a lambda expression has this structure:
        //  op="=>"
        //  args[0] = parameter list (op="()", with args (all of op="id") and namedArgs)
        //  args[1] = expression with unbound arguments
        // An array constructor of the form
        //  V[i:from..to] = expression of i
        // gets mapped to the explicit array operator
        //  V = array[from..to] (i => expression of i)
        map<wstring, pair<TextLocation,ExpressionPtr>> members;
        auto idTok = GotToken();
        while (idTok.kind == identifier)
        {
            let location = idTok.beginLocation; // for error message
            let id = ConsumeIdentifier(); // the member's name
            // optional array constructor
            ExpressionPtr arrayIndexExpr, fromExpr, toExpr;
            if (GotToken().symbol == L"[")
            {
                // X[i:from..to]
                ConsumeToken();
                arrayIndexExpr = ParseOperand(false); // 'i' name of index variable
                if (arrayIndexExpr->op != L"id")
                    Expected(L"identifier");
                ConsumePunctuation(L":");
                fromExpr = ParseExpression(0, false); // 'from' start index
                ConsumePunctuation(L"..");
                toExpr = ParseExpression(0, false); // 'to' end index
                ConsumePunctuation(L"]");
            }
            // optional macro args
            let parameters = (GotToken().symbol == L"(") ? ParseMacroArgs(true/*defining*/) : ExpressionPtr(); // optionally, macro arguments
            ConsumePunctuation(L"=");
            auto rhs = ParseExpression(0, true/*can end at newline*/); // and the right-hand side
            // if macro then rewrite it as an assignment of a lambda expression
            if (parameters)
                rhs = make_shared<Expression>(parameters->location, L"=>", parameters, rhs);
            // if array then rewrite it as an assignment of a array-constructor expression
            if (arrayIndexExpr)
            {
                // create a lambda expression over the index variable
                let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a '()' macro-args expression
                let initLambdaExpr = make_shared<Expression>(arrayIndexExpr->location, L"=>", macroArgs, rhs); // [0] is id, [1] is body
                rhs = make_shared<Expression>(location, L"array");
                rhs->args.push_back(fromExpr); // [0] first index
                rhs->args.push_back(toExpr); // [1] last index
                rhs->args.push_back(initLambdaExpr); // [2] one-argument lambda to initialize
            }
            // insert
            let res = members.insert(make_pair(id, make_pair(location, rhs)));
            if (!res.second)
                Fail(L"duplicate member definition '" + id + L"'", location);
            // advance
            idTok = GotToken();
            if (idTok.symbol == L";")
                idTok = GetToken(); // skip optional ';' separator
        }
        return members;
    }
    // fail unless the entire input has been consumed
    void VerifyAtEnd()
    {
        if (GotToken().kind != eof)
            Fail(L"junk at end of source", GetCursor());
    }
    // top-level parse function parses dictonary members without enclosing [ ... ] and returns it as a dictionary
    ExpressionPtr ParseRecordMembersToDict()
    {
        let topMembers = ParseRecordMembers();
        VerifyAtEnd();
        ExpressionPtr topDict = make_shared<Expression>(GetCursor(), L"[]");
        topDict->namedArgs = topMembers;
        return topDict;
    }
    // simple test function for use during development
    static void Test()
    {
        let parserTest = L"a=1\na1_=13;b=2 // cmt\ndo = (print\n:train:eval) ; x = array[1..13] (i=>1+i*print.message==13*42) ; print = new PrintAction [ message = 'Hello World' ]";
        ParseConfigDictFromString(parserTest, vector<wstring>())->Dump();
    }
};
|
|
|
|
// globally exported functions to execute the parser
|
|
static ExpressionPtr Parse(SourceFile && sourceFile, vector<wstring> && includePaths) { return Parser(move(sourceFile), move(includePaths)).ParseRecordMembersToDict(); }
|
|
// parse configuration text given as a string (e.g. from the command line) into an expression tree
ExpressionPtr ParseConfigDictFromString(wstring text, vector<wstring> && includePaths) { return Parse(SourceFile(L"(command line)", text), move(includePaths)); }
|
// parse a configuration file from disk into an expression tree
// (two-step form is deliberate: 'includePaths' must be read by the SourceFile constructor before it is moved into Parse())
ExpressionPtr ParseConfigDictFromFile(wstring path, vector<wstring> && includePaths) { auto sourceFile = SourceFile(path, includePaths); return Parse(move(sourceFile), move(includePaths)); }
|
// parse a single expression (rather than a record of members), e.g. for command-line evaluation
ExpressionPtr ParseConfigExpression(const wstring & sourceText, vector<wstring> && includePaths)
{
    Parser parser(SourceFile(L"(command line)", sourceText), move(includePaths));
    let expr = parser.ParseExpression(0, true/*can end at newline*/);
    parser.VerifyAtEnd(); // fail if anything is left over after the expression
    return expr;
}
|
|
|
|
}}} // namespaces
|