// pict/cli/ctokenizer.cpp
//
// 815 lines
// 23 KiB
// C++

#include <valarray>
#include <sstream>
#include "ctokenizer.h"
using namespace std;
namespace pictcli_constraints
{
//
// Keywords consumed by parseConstraint()
//
#define TEXT_TokenKeywordIf L"IF"
#define TEXT_TokenKeywordThen L"THEN"
#define TEXT_TokenKeywordElse L"ELSE"
//
// NOTE(review): parseConstraint() also matches TEXT_TokenConstraintEnd, which
// is not defined in this file -- presumably it comes from ctokenizer.h; confirm.
//
// Value set delimiters: the braces are checked by parseTerm(),
// the separator is consumed by getValueSet()
//
#define TEXT_TokenValueSetOpen L"{"
#define TEXT_TokenValueSetSeparator L","
#define TEXT_TokenValueSetClose L"}"
//
// Parameter name delimiters, handled by getParameterName()
// (parseTerm() also peeks at the opening marker)
//
// NOTE(review): an earlier comment here said these are "defined in cp.h as
// it's used by csolver.cpp", yet they are defined right here -- confirm
// whether a second definition exists and where they really belong.
#define TEXT_TokenParameterNameOpen L"["
#define TEXT_TokenParameterNameClose L"]"
//
// Parentheses, handled by parseCondition() and getFunction()
//
#define TEXT_TokenParenthesisOpen L"("
#define TEXT_TokenParenthesisClose L")"
//
// Function names, handled by getFunction()
//
#define TEXT_FunctionIsNegativeParam L"ISNEGATIVE"
#define TEXT_FunctionIsPositiveParam L"ISPOSITIVE"
//
// String delimiter, handled by getValue()
//
#define TEXT_TokenQuotes L"\""
//
// Relations, handled by getRelationType()
//
#define TEXT_TokenRelationEQ L"="
#define TEXT_TokenRelationNE L"<>"
#define TEXT_TokenRelationLT L"<"
#define TEXT_TokenRelationLE L"<="
#define TEXT_TokenRelationGT L">"
#define TEXT_TokenRelationGE L">="
#define TEXT_TokenRelationIN L"IN"
#define TEXT_TokenRelationLIKE L"LIKE"
//
// Binary logical operators, handled by getLogicalOper()
//
#define TEXT_TokenLogicalOperAND L"AND"
#define TEXT_TokenLogicalOperOR L"OR"
//
// NOT has no dedicated function because of the grammar; it is matched
// directly in parseCondition() (NOT <clause>) and in getRelationType()
// (NOT IN / NOT LIKE)
//
#define TEXT_TokenLogicalOperNOT L"NOT"
//
// Escape marker recognized by getString() within a string
//
#define TEXT_SpecialCharMarker L'\\'
//
// Characters that may follow the escape marker; getString() appends them
// literally and rejects any other escaped character with UnknownSpecialChar
//
const wchar_t SpecialCharacters[] = { TEXT_SpecialCharMarker, L'"', L']' };
//
//
//
void ConstraintsTokenizer::Tokenize()
{
_tokenLists.clear();
while( _currentPosition < _constraintsText.end() )
{
CTokenList tokenList;
parseConstraint( tokenList );
_tokenLists.push_back( tokenList );
skipWhiteChars();
}
}
//
//
//
void ConstraintsTokenizer::cleanUpTokenLists()
{
for( auto & tokenList : _tokenLists )
for( auto & token : tokenList )
delete( token );
}
//
// Parses one constraint and appends its tokens to 'tokens':
//
// constraint ::= IF <clause> THEN <clause>;
//                IF <clause> THEN <clause> ELSE <clause>;
//                <clause>;
//
// Throws CSyntaxError when THEN is missing after IF <clause> or when the
// constraint is not closed with the termination marker.
//
void ConstraintsTokenizer::parseConstraint( IN OUT CTokenList& tokens )
{
    skipWhiteChars();

    // matching a keyword moves the cursor, so remember where the keyword starts
    wstring::iterator keywordStart = _currentPosition;

    // optional "IF <clause> THEN" prefix
    if ( isNextSubstring( wstring( TEXT_TokenKeywordIf )))
    {
        tokens.push_back( new CToken( TokenType::KeywordIf, keywordStart ));

        skipWhiteChars();
        parseClause( tokens );

        skipWhiteChars();
        keywordStart = _currentPosition;
        if ( ! isNextSubstring( charArrToStr( TEXT_TokenKeywordThen )))
        {
            throw CSyntaxError( SyntaxErrorType::NoKeywordThen, _currentPosition );
        }
        tokens.push_back( new CToken( TokenType::KeywordThen, keywordStart ));
    }

    // the THEN part, or the whole body of an unconditional constraint
    parseClause( tokens );

    // optional ELSE part
    skipWhiteChars();
    keywordStart = _currentPosition;
    if ( isNextSubstring( charArrToStr( TEXT_TokenKeywordElse )))
    {
        tokens.push_back( new CToken( TokenType::KeywordElse, keywordStart ));
        parseClause( tokens );
    }

    // every form of a constraint must end with the termination marker
    skipWhiteChars();
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenConstraintEnd )))
    {
        throw CSyntaxError( SyntaxErrorType::NoConstraintEnd, _currentPosition );
    }

    // some functions work like macros; expand them within the token list
    doPostParseExpansions( tokens );
}
//
// Parses a clause and appends its tokens to 'tokens':
//
// clause ::= <condition>
//            <condition> <logical_operator> <clause>
//
// The right-recursive rule is processed as a loop: conditions are read
// for as long as each one is followed by AND or OR.
//
void ConstraintsTokenizer::parseClause( IN OUT CTokenList& tokens )
{
    for( ;; )
    {
        skipWhiteChars();
        parseCondition( tokens );

        // getLogicalOper() may move the cursor, so note where the operator starts
        skipWhiteChars();
        wstring::iterator operStart = _currentPosition;

        LogicalOper oper = getLogicalOper();
        if ( LogicalOper::Unknown == oper )
        {
            // no operator follows: the clause is complete
            return;
        }

        tokens.push_back( new CToken( oper, operStart ));
        skipWhiteChars();
    }
}
//
// Parses a condition and appends its tokens to 'tokens':
//
// condition ::= <term>
//               (<clause>)
//               NOT <clause>
//
// Throws CSyntaxError( NoEndParenthesis ) when an opened parenthesis
// is not closed.
//
void ConstraintsTokenizer::parseCondition( IN OUT CTokenList& tokens )
{
    skipWhiteChars();
    wstring::iterator start = _currentPosition;

    // (<clause>)
    if ( isNextSubstring( charArrToStr( TEXT_TokenParenthesisOpen )))
    {
        tokens.push_back( new CToken( TokenType::ParenthesisOpen, start ));

        skipWhiteChars();
        parseClause( tokens );

        skipWhiteChars();
        start = _currentPosition;
        if ( ! isNextSubstring( charArrToStr( TEXT_TokenParenthesisClose )))
        {
            throw CSyntaxError( SyntaxErrorType::NoEndParenthesis, _currentPosition );
        }
        tokens.push_back( new CToken( TokenType::ParenthesisClose, start ));
        return;
    }

    // NOT <clause>
    if ( isNextSubstring( charArrToStr( TEXT_TokenLogicalOperNOT )))
    {
        tokens.push_back( new CToken( LogicalOper::Not, start ));

        skipWhiteChars();
        parseClause( tokens );
        return;
    }

    // <term>
    parseTerm( tokens );
}
//
// Parses a term and appends a single token for it to 'tokens':
//
// term ::= <parameter_name> <relation> <value>
//          <parameter_name> LIKE <string>
//          <parameter_name> IN {<value_set>}
//          <parameter_name> <relation> <parameter_name>
//          {functions on term level}
//
// Parameter names that are not found in the model are stored as nullptr;
// no error is raised for them here.
//
// Throws CSyntaxError on malformed input. Objects allocated for the term
// being built (value set, value, term, function) are released before an
// exception leaves this function.
//
void ConstraintsTokenizer::parseTerm( IN OUT CTokenList& tokens )
{
    skipWhiteChars();
    wstring::iterator position = _currentPosition;

    // check whether it's one of the functions
    CFunction *function = getFunction();
    if( nullptr != function )
    {
        CToken* token;
        try
        {
            token = new CToken( function, position );
        }
        catch( ... )
        {
            delete( function );
            throw;
        }
        tokens.push_back( token );
    }
    // if not, parse anything that starts with a parameter name
    else
    {
        wstring paramName = getParameterName();

        CParameters::iterator found = _model.findParamByName( paramName );
        CParameter* param = nullptr;
        if ( found != _model.Parameters.end() )
        {
            param = &*found;
        }

        skipWhiteChars();
        RelationType relationType = getRelationType();
        skipWhiteChars();

        CTerm* term = nullptr;
        switch( relationType )
        {
        case RelationType::In:
        case RelationType::NotIn:
            {
                // look for the opening brace before allocating anything
                if ( ! isNextSubstring( charArrToStr( TEXT_TokenValueSetOpen )))
                {
                    throw CSyntaxError( SyntaxErrorType::NoValueSetOpen, _currentPosition );
                }

                // from here on every failure, syntax errors included, must free the value set
                CValueSet* valueSet = new CValueSet;
                try
                {
                    getValueSet( *valueSet );

                    skipWhiteChars();
                    if ( ! isNextSubstring( charArrToStr( TEXT_TokenValueSetClose )))
                    {
                        throw CSyntaxError( SyntaxErrorType::NoValueSetClose, _currentPosition );
                    }

                    // raw text of a term
                    wstring rawText;
                    rawText.assign( position, _currentPosition );

                    term = new CTerm( param, relationType, TermDataType::ValueSet, valueSet, rawText );
                }
                catch( ... )
                {
                    delete( valueSet );
                    throw;
                }
                break;
            }
        // At this point the relation LIKE is treated as an ordinary relation
        // despite the fact it can only have a string as an argument on
        // the right-side. It will be verified later during parsing.
        default:
            {
                // peek only: getParameterName() consumes the opening marker itself
                if ( isNextSubstring( charArrToStr( TEXT_TokenParameterNameOpen ), true ))
                {
                    wstring paramName2 = getParameterName();

                    // look up the right-hand side parameter by its name
                    CParameter *param2 = nullptr;
                    found = _model.findParamByName( paramName2 );
                    if ( found != _model.Parameters.end() )
                    {
                        param2 = &*found;
                    }

                    // raw text of a term
                    wstring rawText;
                    rawText.assign( position, _currentPosition );

                    term = new CTerm( param, relationType, TermDataType::ParameterName, param2, rawText );
                }
                else
                {
                    CValue* value = getValue();

                    // raw text of a term
                    wstring rawText;
                    rawText.assign( position, _currentPosition );

                    try
                    {
                        term = new CTerm( param, relationType, TermDataType::Value, value, rawText );
                    }
                    catch( ... )
                    {
                        delete( value );
                        throw;
                    }
                }
                break;
            }
        }

        // now create token of type 'term'; this token has data
        CToken* token;
        try
        {
            token = new CToken( term, position );
        }
        catch( ... )
        {
            delete( term );
            throw;
        }
        tokens.push_back( token );
    }
}
//
// Parses a function call at the current position:
//
// <term> ::= IsNegative(<parameter_name>)
//            IsPositive(<parameter_name>)
//
// Returns a newly allocated CFunction when a function was parsed, or
// "nullptr" when the text does not start with a known function name
// (only leading whitespace has been skipped in that case).
// The parameter name is everything up to the closing parenthesis; when no
// such parameter exists in the model, the function carries a null parameter.
//
// Throws CSyntaxError when a parenthesis is missing.
//
CFunction *ConstraintsTokenizer::getFunction()
{
    skipWhiteChars();
    wstring::iterator start = _currentPosition;

    FunctionType kind = FunctionType::Unknown;
    if ( isNextSubstring( charArrToStr( TEXT_FunctionIsNegativeParam )))
    {
        kind = FunctionType::IsNegativeParam;
    }
    else if ( isNextSubstring( charArrToStr( TEXT_FunctionIsPositiveParam )))
    {
        kind = FunctionType::IsPositiveParam;
    }

    // not a function at all
    if ( FunctionType::Unknown == kind )
    {
        return nullptr;
    }

    // opening bracket
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenParenthesisOpen )))
    {
        throw CSyntaxError( SyntaxErrorType::FunctionNoParenthesisOpen, _currentPosition );
    }

    // the argument: a parameter name running up to the closing bracket
    skipWhiteChars();
    wstring paramName = getString( charArrToStr( TEXT_TokenParenthesisClose ));

    CParameters::iterator found = _model.findParamByName( paramName );
    CParameter* param = ( found != _model.Parameters.end() ) ? &*found : nullptr;

    // closing bracket
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenParenthesisClose )))
    {
        throw CSyntaxError( SyntaxErrorType::FunctionNoParenthesisClose, _currentPosition );
    }

    // raw text covers everything from the function name to the closing bracket
    wstring rawText;
    rawText.assign( start, _currentPosition );

    return new CFunction( kind, FunctionDataType::Parameter, param, paramName, rawText );
}
//
// Reads a value at the current position and returns it as a CValue:
// a quoted string when the text starts with quotes, a number otherwise.
//
// Note: allocates memory, the caller is supposed to free it.
// Throws CSyntaxError when the closing quotes are missing or the text
// is not a number.
//
CValue* ConstraintsTokenizer::getValue()
{
    // a string always begins with quotes; anything else must be a number
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenQuotes )))
    {
        double number = getNumber();
        return new CValue( number );
    }

    wstring text = getString( charArrToStr( TEXT_TokenQuotes ));
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenQuotes )))
    {
        throw CSyntaxError( SyntaxErrorType::UnexpectedEndOfString, _currentPosition );
    }
    return new CValue( text );
}
//
// Parses a value set and appends copies of its values to 'valueSet':
//
// value_set ::= <value>
//               <value>,<value_set>
//
// The right-recursive rule is processed as a loop: values are read for as
// long as each one is followed by a separator. Propagates the errors of
// getValue(); the temporary value is freed even when appending fails.
//
void ConstraintsTokenizer::getValueSet( OUT CValueSet& valueSet )
{
    do
    {
        skipWhiteChars();

        // getValue() hands over ownership; only a copy goes into the set
        CValue* value = getValue();
        try
        {
            valueSet.push_back( *value );
        }
        catch( ... )
        {
            delete( value );
            throw;
        }
        delete( value );

        skipWhiteChars();
    }
    while( isNextSubstring( charArrToStr( TEXT_TokenValueSetSeparator )));
}
//
// Returns the relation found at the current position and moves past it.
// The order of comparisons is important: two-character operators are tried
// before the one-character operators they start with.
//
// Throws CSyntaxError( UnknownRelation ) when no relation matches or when
// NOT is followed by neither IN nor LIKE.
//
RelationType ConstraintsTokenizer::getRelationType()
{
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationEQ ))) return RelationType::Eq;

    // "<>", "<=" and ">=" must be tried before "<" and ">"
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationNE ))) return RelationType::Ne;
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationLE ))) return RelationType::Le;
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationGE ))) return RelationType::Ge;
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationGT ))) return RelationType::Gt;
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationLT ))) return RelationType::Lt;

    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationIN )))   return RelationType::In;
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationLIKE ))) return RelationType::Like;

    // the only remaining possibilities are NOT IN and NOT LIKE
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenLogicalOperNOT )))
    {
        throw CSyntaxError( SyntaxErrorType::UnknownRelation, _currentPosition );
    }

    skipWhiteChars();
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationIN )))   return RelationType::NotIn;
    if ( isNextSubstring( charArrToStr( TEXT_TokenRelationLIKE ))) return RelationType::NotLike;

    throw CSyntaxError( SyntaxErrorType::UnknownRelation, _currentPosition );
}
//
// Returns the binary logical operator (AND, OR) found at the current position
// and moves past it; returns LogicalOper::Unknown, leaving the cursor in
// place, when neither matches. NOT is not handled here as it's parsed directly.
//
LogicalOper ConstraintsTokenizer::getLogicalOper()
{
    LogicalOper oper = LogicalOper::Unknown;

    if ( isNextSubstring( charArrToStr( TEXT_TokenLogicalOperAND )))
    {
        oper = LogicalOper::And;
    }
    else if ( isNextSubstring( charArrToStr( TEXT_TokenLogicalOperOR )))
    {
        oper = LogicalOper::Or;
    }

    return oper;
}
//
// Parses a parameter name enclosed in its opening and closing markers and
// returns the text between them (escaped special characters resolved by
// getString()).
//
// Throws CSyntaxError when either marker is missing.
//
wstring ConstraintsTokenizer::getParameterName()
{
    // opening marker
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenParameterNameOpen )))
    {
        throw CSyntaxError( SyntaxErrorType::NoParameterNameOpen, _currentPosition );
    }

    // the name itself: everything up to the closing marker
    wstring name = getString( charArrToStr( TEXT_TokenParameterNameClose ));

    // closing marker
    if ( ! isNextSubstring( charArrToStr( TEXT_TokenParameterNameClose )))
    {
        throw CSyntaxError( SyntaxErrorType::NoParameterNameClose, _currentPosition );
    }

    return name;
}
//
// Returns a number; reads from a string stream.
//
double ConstraintsTokenizer::getNumber()
{
// declare new stream from text we'd like to parse
// then try to get numeric value preserving old and new
// position within a stream to properly update cursor
wstring substring( _currentPosition, _constraintsText.end() );
wistringstream ist( substring );
unsigned int positionBefore = (unsigned int) ist.tellg();
double number;
ist>>number;
if (ist.rdstate() & ios::failbit)
{
throw CSyntaxError( SyntaxErrorType::NotNumericValue, _currentPosition );
}
// success, update current cursor position
unsigned int difference = (unsigned int) ist.tellg() - positionBefore;
_currentPosition += difference;
return ( number );
}
//
// Reads characters as part of a string until the terminator is met and
// returns them. The terminator is the enclosing char, typically a ";
// it is not consumed: the cursor is left pointing at it.
//
// A character preceded by the special-character marker is appended
// literally, provided it is one of SpecialCharacters.
//
// Throws CSyntaxError( UnknownSpecialChar ) for any other escaped character
// and lets the end-of-text error of peekNextChar() propagate when the
// terminator never shows up.
//
wstring ConstraintsTokenizer::getString( IN const wstring& terminator )
{
    assert( 1 == terminator.size() );
    const wchar_t stopChar = terminator[ 0 ];

    wstring text;
    for( ;; )
    {
        // throws when there are no chars left
        const wchar_t current = peekNextChar();

        // properly terminated: step back so the caller can match the terminator
        if ( stopChar == current )
        {
            movePosition( -1 );
            return text;
        }

        // regular character
        if ( TEXT_SpecialCharMarker != current )
        {
            text += current;
            continue;
        }

        // escape sequence: the next character must be a known special one
        const wchar_t escaped = peekNextChar();
        bool recognized = false;
        for( auto candidate : SpecialCharacters )
        {
            if ( escaped == candidate )
            {
                recognized = true;
                break;
            }
        }
        if ( ! recognized )
        {
            throw CSyntaxError( SyntaxErrorType::UnknownSpecialChar, _currentPosition );
        }

        // append the special character in its literal form
        text += escaped;
    }
}
//
// Skips all whitespace characters on and after current position.
//
void ConstraintsTokenizer::skipWhiteChars()
{
// probe next character; the function throws error when there are no chars left),
try
{
while ( true )
{
wchar_t nextChar = peekNextChar();
if ( ! ( iswspace ( nextChar ) // all white space characters
|| iswcntrl ( nextChar ))) // CRLF
{
movePosition( -1 );
break;
}
}
}
// there's nothing wrong with encountering the end of string here;
// other errors should be thrown to callers
catch ( CSyntaxError e )
{
if ( SyntaxErrorType::UnexpectedEndOfString != e.Type )
{
throw e;
}
}
}
//
// Returns the character at the current position and moves the position
// past it (despite the name, the character is consumed).
// Throws CSyntaxError( UnexpectedEndOfString ) when no characters are left.
//
wchar_t ConstraintsTokenizer::peekNextChar()
{
    if ( _currentPosition < _constraintsText.end() )
    {
        const wchar_t ch = *_currentPosition;
        ++_currentPosition;
        return ch;
    }

    throw CSyntaxError( SyntaxErrorType::UnexpectedEndOfString, _currentPosition );
}
//
// Checks, ignoring case, whether 'text' appears at the current position
// (leading whitespace is skipped first). If texts match, returns True and
// also moves the cursor past the match, unless dontMoveCursor is set.
//
bool ConstraintsTokenizer::isNextSubstring( IN const wstring& text, IN bool dontMoveCursor )
{
    skipWhiteChars();

    // 'equal' reads text.size() characters from the cursor on, so make sure
    // that many are left; after skipWhiteChars() the cursor is never past the end
    const wstring::size_type charsLeft =
        static_cast<wstring::size_type>( distance( _currentPosition, _constraintsText.end() ));
    if ( charsLeft < text.size() )
    {
        return false;
    }

    // towupper is the wide-character routine; the narrow toupper is only
    // defined for values that fit in an unsigned char
    const bool textsMatch = equal( text.begin(), text.end(), _currentPosition,
                                   []( wchar_t c1, wchar_t c2 ) { return ( towupper( c1 ) == towupper( c2 ) ); }
                                 );

    if ( textsMatch && ! dontMoveCursor )
    {
        _currentPosition += text.length();
    }
    return textsMatch;
}
//
// Moves the current position by 'count' characters (negative values move
// backwards). The move is clamped so the position never leaves the range
// [begin, end] of the constraints text.
//
void ConstraintsTokenizer::movePosition( IN int count )
{
    // Clamp the offset, not the resulting iterator: an iterator pointing
    // outside of [begin, end] must not even be formed.
    const wstring::difference_type lowest  = _constraintsText.begin() - _currentPosition;
    const wstring::difference_type highest = _constraintsText.end()   - _currentPosition;

    wstring::difference_type offset = count;
    if ( offset < lowest )
    {
        offset = lowest;
    }
    else if ( offset > highest )
    {
        offset = highest;
    }

    _currentPosition += offset;
}
//
// Expands "macros" in the token list; there are two macros currently:
// IsNegative() == ( IsNegative(p1) or IsNegative(p2) or ... )
// IsPositive() == ( IsPositive(p1) and IsPositive(p2) and ... )
//
// A function token with an empty parameter text is replaced, in place, by a
// parenthesized chain with one function token per model parameter, result
// parameters excluded. All new tokens reuse the position and the raw text
// of the token they replace.
//
void ConstraintsTokenizer::doPostParseExpansions( IN OUT CTokenList& tokens )
{
    CTokenList::iterator i_token = tokens.begin();
    while( i_token != tokens.end() )
    {
        if ( (*i_token)->Type != TokenType::Function )
        {
            ++i_token;
            continue;
        }

        CFunction *function = (CFunction*) (*i_token)->Function;
        const bool isMacro = ( function->Type == FunctionType::IsNegativeParam
                            || function->Type == FunctionType::IsPositiveParam )
                          && function->DataText.empty();
        if ( ! isMacro )
        {
            // a function with an explicit parameter stays as it is
            ++i_token;
            continue;
        }

        // we don't have to deallocate Data because in this case it is always "nullptr"
        assert( function->Data == nullptr );

        // save what the new tokens need before the current token goes away
        wstring::iterator oldPosInText = (*i_token)->PositionInText;
        FunctionType      oldType      = function->Type;
        wstring           oldRawText   = function->RawText;
        LogicalOper       joinOper     = ( oldType == FunctionType::IsNegativeParam ) ? LogicalOper::Or
                                                                                      : LogicalOper::And;

        // deallocate and remove the current token; i_token now points at the
        // token that followed it and all insertions go right before that one
        delete( *i_token );
        i_token = tokens.erase( i_token );

        // (
        tokens.insert( i_token, new CToken( TokenType::ParenthesisOpen, oldPosInText ));

        // Track the first emitted operand explicitly: comparing against
        // Parameters.begin() would put an operator right after the opening
        // parenthesis whenever the leading parameters are result parameters.
        bool firstOperand = true;
        for( CParameters::iterator i_param =  _model.Parameters.begin();
                                   i_param != _model.Parameters.end();
                                 ++i_param )
        {
            if ( i_param->ResultParam ) continue;

            if ( ! firstOperand )
            {
                // logical operator OR or AND
                tokens.insert( i_token, new CToken( joinOper, oldPosInText ));
            }
            firstOperand = false;

            // IsNegative(param) / IsPositive(param)
            CFunction* newFunction = new CFunction( oldType, FunctionDataType::Parameter,
                                                    &*i_param, i_param->Name, oldRawText );
            CToken* newToken;
            try
            {
                newToken = new CToken( newFunction, oldPosInText );
            }
            catch( ... )
            {
                delete( newFunction );
                throw;
            }
            tokens.insert( i_token, newToken );
        }

        // )
        tokens.insert( i_token, new CToken( TokenType::ParenthesisClose, oldPosInText ));
    }
}
}