gecko-dev/extensions/transformiix/source/xpath/ExprLexer.cpp

509 строки
14 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* The contents of this file are subject to the Mozilla Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is TransforMiiX XSLT processor.
*
* The Initial Developer of the Original Code is The MITRE Corporation.
* Portions created by MITRE are Copyright (C) 1999 The MITRE Corporation.
*
* Portions created by Keith Visco as a Non MITRE employee,
* (C) 1999 Keith Visco. All Rights Reserved.
*
* Contributor(s):
* Keith Visco, kvisco@ziplink.net
* -- original author.
* -- fixed bug with '<=' and '>=' reported by Bob Miller
*
* Bob Miller, Oblix Inc., kbob@oblix.com
* -- fixed bug with single quotes inside double quotes
*
* Marina Mechtcheriakova, mmarina@mindspring.com
* -- Fixed bug in parse method so that we make sure we check for
* axis identifier wild cards, such as ancestor::*
*
* Axel Hecht <axel@pike.org>
* -- big beating, general overhaul
*
*/
/**
* Lexical analyzer for XPath expressions
**/
#include "ExprLexer.h"
#include "XMLUtils.h"
//---------------------------/
//- Implementation of Token -/
//---------------------------/
/**
* Default constructor for Token
**/
Token::Token()
{
this->type =0;
} //-- Token;
/**
* Constructor for Token
* @param type, the type of Token being represented
**/
Token::Token(short type)
{
this->type = type;
} //-- Token;
/**
* Constructor for Token
* @param value the value of this Token
* @param type, the type of Token being represented
**/
Token::Token(const String& value, short type)
{
this->type = type;
//-- make copy of value String
this->value = value;
} //-- Token
Token::Token(PRUnichar uniChar, short type)
{
this->type = type;
this->value.Append(uniChar);
} //-- Token
/**
* Copy Constructor
**/
Token::Token(const Token& token)
{
this->type = token.type;
this->value = token.value;
} //-- Token
/**
* Destructor for Token
**/
Token::~Token()
{
//-- currently nothing is needed
} //-- ~Token
//--------------------------------/
//- Implementation of ExprLexer -/
//-------------------------------/
/*
* Complex Tokens
*/
//-- Nodetype tokens
const String ExprLexer::COMMENT(NS_LITERAL_STRING("comment"));
const String ExprLexer::NODE(NS_LITERAL_STRING("node"));
const String ExprLexer::PROC_INST(NS_LITERAL_STRING("processing-instruction"));
const String ExprLexer::TEXT(NS_LITERAL_STRING("text"));
//-- boolean
const String ExprLexer::AND(NS_LITERAL_STRING("and"));
const String ExprLexer::OR(NS_LITERAL_STRING("or"));
//-- multiplicative operators
const String ExprLexer::MODULUS(NS_LITERAL_STRING("mod"));
const String ExprLexer::DIVIDE(NS_LITERAL_STRING("div"));
/**
* The set of Lexer error messages
**/
const String ExprLexer::error_message[] =
{
String(NS_LITERAL_STRING("VariableReference expected")),
String(NS_LITERAL_STRING("Operator expected")),
String(NS_LITERAL_STRING("Literal is not closed")),
String(NS_LITERAL_STRING(": not expected")),
String(NS_LITERAL_STRING("! not expected, use != or not()")),
String(NS_LITERAL_STRING("found a unkown character"))
};
//---------------/
//- Contructors -/
//---------------/
/**
* Creates a new ExprLexer using the given String
**/
ExprLexer::ExprLexer(const String& pattern)
{
firstItem = 0;
lastItem = 0;
tokenCount = 0;
prevToken = 0;
endToken.type = Token::END;
parse(pattern);
currentItem = firstItem;
} //-- ExprLexer
/**
* Destroys this instance of an ExprLexer
**/
ExprLexer::~ExprLexer()
{
//-- delete tokens
currentItem = firstItem;
while (currentItem) {
TokenListItem* temp = currentItem->next;
delete currentItem->token;
delete currentItem;
currentItem = temp;
}
} //-- ~ExprLexer
MBool ExprLexer::hasMoreTokens()
{
return (currentItem != 0);
} //-- hasMoreTokens
Token* ExprLexer::nextToken()
{
if (currentItem) {
Token* token = currentItem->token;
currentItem = currentItem->next;
return token;
}
return &endToken;
} //-- nextToken
void ExprLexer::pushBack()
{
if (!currentItem)
currentItem = lastItem;
else
currentItem = currentItem->previous;
} //-- pushBack
Token* ExprLexer::peek()
{
if (currentItem)
return currentItem->token;
return &endToken;
} //-- peek
void ExprLexer::addToken(Token* token)
{
TokenListItem* tlItem = new TokenListItem;
tlItem->token = token;
tlItem->next = 0;
if (lastItem) {
tlItem->previous = lastItem;
lastItem->next = tlItem;
}
if (!firstItem)
firstItem = tlItem;
lastItem = tlItem;
prevToken = token;
++tokenCount;
} //-- addToken
/**
* Returns true if the following Token should be an operator.
* This is a helper for the first bullet of [XPath 3.7]
* Lexical Structure
**/
MBool ExprLexer::nextIsOperatorToken(Token* token)
{
if (!token || token->type == Token::NULL_TOKEN)
return MB_FALSE;
/* This relies on the tokens having the right order in ExprLexer.h */
if (token->type >= Token::COMMA &&
token->type <= Token::UNION_OP)
return MB_FALSE;
return MB_TRUE;
} //-- nextIsOperatorToken
/**
* Parses the given String into the set of Tokens
**/
void ExprLexer::parse(const String& pattern)
{
if (pattern.IsEmpty())
return;
String tokenBuffer;
PRUint32 iter = 0, start;
PRUint32 size = pattern.Length();
short defType;
PRUnichar ch;
//-- initialize previous token, this will automatically get
//-- deleted when it goes out of scope
Token nullToken('\0', Token::NULL_TOKEN);
prevToken = &nullToken;
while (iter < size) {
ch = pattern.CharAt(iter);
defType = Token::CNAME;
if (ch==DOLLAR_SIGN) {
if (++iter == size || !XMLUtils::isLetter(ch=pattern.CharAt(iter))) {
// Error, VariableReference expected
errorPos = iter;
errorCode = ERROR_UNRESOLVED_VAR_REFERENCE;
if (firstItem)
firstItem->token->type=Token::ERROR;
else
addToken(new Token('\0',Token::ERROR));
iter=size; // bail
}
else
defType = Token::VAR_REFERENCE;
}
// just reuse the QName parsing, which will use defType
// the token to construct
if (XMLUtils::isLetter(ch)) {
// NCName, can get QName or OperatorName;
// FunctionName, NodeName, and AxisSpecifier may want whitespace,
// and are dealt with below
start = iter;
while (++iter < size &&
XMLUtils::isNCNameChar(pattern.CharAt(iter))) /* just go */ ;
if (iter < size && pattern.CharAt(iter)==COLON) {
// try QName or wildcard, might need to step back for axis
if (++iter < size)
if (XMLUtils::isLetter(pattern.CharAt(iter)))
while (++iter < size &&
XMLUtils::isNCNameChar(pattern.CharAt(iter))) /* just go */ ;
else if (pattern.CharAt(iter)=='*'
&& defType != Token::VAR_REFERENCE)
++iter; /* eat wildcard for NameTest, bail for var ref at COLON */
else
iter--; // step back
}
if (nextIsOperatorToken(prevToken)) {
if (pattern.subString(start,iter,subStr).Equals(AND))
defType = Token::AND_OP;
else if (pattern.subString(start,iter,subStr).Equals(OR))
defType = Token::OR_OP;
else if (pattern.subString(start,iter,subStr).Equals(MODULUS))
defType = Token::MODULUS_OP;
else if (pattern.subString(start,iter,subStr).Equals(DIVIDE))
defType = Token::DIVIDE_OP;
else {
// Error "operator expected"
// XXX QUESTION: spec is not too precise
// badops is sure an error, but is bad:ops, too? We say yes!
errorPos = iter;
errorCode = ERROR_OP_EXPECTED;
if (firstItem)
firstItem->token->type=Token::ERROR;
else
addToken(new Token('\0',Token::ERROR));
iter=size; // bail
}
}
addToken(new Token(pattern.subString(start,iter,subStr),defType));
}
else if (isXPathDigit(ch)) {
start = iter;
while (++iter < size &&
isXPathDigit(pattern.CharAt(iter))) /* just go */;
if (iter < size && pattern.CharAt(iter) == '.')
while (++iter < size &&
isXPathDigit(pattern.CharAt(iter))) /* just go */;
addToken(new Token(pattern.subString(start,iter,subStr),Token::NUMBER));
}
else {
switch (ch) {
//-- ignore whitespace
case SPACE:
case TX_TAB:
case TX_CR:
case TX_LF:
++iter;
break;
case S_QUOTE :
case D_QUOTE :
start=iter;
iter = pattern.indexOf(ch, (PRInt32)start + 1);
if ((PRInt32)iter == kNotFound) {
// XXX Error reporting "unclosed literal"
errorPos = start;
errorCode = ERROR_UNCLOSED_LITERAL;
if (firstItem)
firstItem->token->type=Token::ERROR;
else
addToken(new Token('\0',Token::ERROR));
iter=size; // bail
}
else {
addToken(new Token(pattern.subString(start+1,iter,subStr),
Token::LITERAL));
++iter;
}
break;
case PERIOD:
// period can be .., .(DIGITS)+ or ., check next
if (++iter < size) {
ch=pattern.CharAt(iter);
if (isXPathDigit(ch)) {
start=iter-1;
while (++iter < size &&
isXPathDigit(pattern.CharAt(iter))) /* just go */;
addToken(new Token(pattern.subString(start,iter,subStr),
Token::NUMBER));
}
else if (ch==PERIOD) {
addToken(new Token(pattern.subString(iter-1,iter++,subStr),
Token::PARENT_NODE));
}
else
addToken(new Token(PERIOD, Token::SELF_NODE));
}
else
addToken(new Token(ch, Token::SELF_NODE));
// iter++ is already in the number test
break;
case COLON: // QNames are dealt above, must be axis ident
if (++iter < size && pattern.CharAt(iter)==COLON &&
prevToken->type == Token::CNAME) {
prevToken->type = Token::AXIS_IDENTIFIER;
++iter;
}
else {
// XXX Error report "colon is neither QName nor axis"
errorPos = iter;
errorCode = ERROR_COLON;
if (firstItem)
firstItem->token->type=Token::ERROR;
else
addToken(new Token('\0',Token::ERROR));
iter=size; // bail
}
break;
case FORWARD_SLASH :
if (++iter < size && pattern.CharAt(iter)==ch) {
addToken(new Token(pattern.subString(iter-1,++iter,subStr),
Token::ANCESTOR_OP));
}
else {
addToken(new Token(ch, Token::PARENT_OP));
}
break;
case BANG : // can only be !=
if (++iter < size && pattern.CharAt(iter)==EQUAL) {
addToken(new Token(pattern.subString(iter-1,++iter,subStr),
Token::NOT_EQUAL_OP));
}
else {
// Error ! is not not()
errorPos = iter;
errorCode = ERROR_BANG;
if (firstItem)
firstItem->token->type=Token::ERROR;
else
addToken(new Token('\0',Token::ERROR));
iter=size; // bail
}
break;
case EQUAL:
addToken(new Token(ch,Token::EQUAL_OP));
++iter;
break;
case L_ANGLE:
if (++iter < size && pattern.CharAt(iter)==EQUAL) {
addToken(new Token(pattern.subString(iter-1,++iter,subStr),
Token::LESS_OR_EQUAL_OP));
}
else
addToken(new Token(ch,Token::LESS_THAN_OP));
break;
case R_ANGLE:
if (++iter < size && pattern.CharAt(iter)==EQUAL) {
addToken(new Token(pattern.subString(iter-1,++iter,subStr),
Token::GREATER_OR_EQUAL_OP));
}
else
addToken(new Token(ch,Token::GREATER_THAN_OP));
break;
case HYPHEN :
addToken(new Token(ch,Token::SUBTRACTION_OP));
++iter;
break;
case ASTERIX:
if (nextIsOperatorToken(prevToken))
addToken(new Token(ch,Token::MULTIPLY_OP));
else
addToken(new Token(ch,Token::CNAME));
++iter;
break;
case L_PAREN:
if (prevToken->type == Token::CNAME) {
if (prevToken->value.Equals(COMMENT))
prevToken->type = Token::COMMENT;
else if (prevToken->value.Equals(NODE))
prevToken->type = Token::NODE;
else if (prevToken->value.Equals(PROC_INST))
prevToken->type = Token::PROC_INST;
else if (prevToken->value.Equals(TEXT))
prevToken->type = Token::TEXT;
else
prevToken->type = Token::FUNCTION_NAME;
}
++iter;
addToken(new Token(ch,Token::L_PAREN));
break;
case R_PAREN:
++iter;
addToken(new Token(ch,Token::R_PAREN));
break;
case L_BRACKET:
++iter;
addToken(new Token(ch,Token::L_BRACKET));
break;
case R_BRACKET:
++iter;
addToken(new Token(ch,Token::R_BRACKET));
break;
case COMMA:
++iter;
addToken(new Token(ch,Token::COMMA));
break;
case AT_SIGN :
++iter;
addToken(new Token(ch,Token::AT_SIGN));
break;
case PLUS:
++iter;
addToken(new Token(ch,Token::ADDITION_OP));
break;
case VERT_BAR:
++iter;
addToken(new Token(ch,Token::UNION_OP));
break;
default:
// Error, don't grok character :-(
errorPos = iter;
errorCode = ERROR_UNKNOWN_CHAR;
if (firstItem)
firstItem->token->type=Token::ERROR;
else
addToken(new Token('\0',Token::ERROR));
iter=size; // bail
}
}
}
} //-- parse