gecko-dev/extensions/transformiix/source/xpath/ExprLexer.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is TransforMiiX XSLT processor.
 *
 * The Initial Developer of the Original Code is The MITRE Corporation.
 * Portions created by MITRE are Copyright (C) 1999 The MITRE Corporation.
 *
 * Portions created by Keith Visco as a Non MITRE employee,
 * (C) 1999 Keith Visco. All Rights Reserved.
 *
 * Contributor(s):
 * Keith Visco, kvisco@ziplink.net
 *   -- original author.
 *   -- fixed bug with '<=' and '>=' reported by Bob Miller
 *
 * Bob Miller, Oblix Inc., kbob@oblix.com
 *   -- fixed bug with single quotes inside double quotes
 *
 * Marina Mechtcheriakova, mmarina@mindspring.com
 *   -- Fixed bug in parse method so that we make sure we check for
 *      axis identifier wild cards, such as ancestor::*
 *
 * Axel Hecht <axel@pike.org>
 *   -- big beating, general overhaul
 *
 */

/**
 * Lexical analyzer for XPath expressions
**/

#include "ExprLexer.h"
#include "XMLUtils.h"

  //---------------------------/
 //- Implementation of Token -/
//---------------------------/


/**
 * Creates a Token with an empty value and a type of 0.
**/
Token::Token()
  : type(0)
{
} //-- Token

/**
 * Creates a Token of the given type with an empty value.
 * @param type the type of Token being represented
**/
Token::Token(short type)
  : type(type)
{
} //-- Token

/**
 * Creates a Token holding its own copy of the given text.
 * @param value the text of this Token; it is copied, not referenced
 * @param type the type of Token being represented
**/
Token::Token(const String& value, short type)
  : value(value),
    type(type)
{
} //-- Token

/**
 * Creates a Token whose value consists of a single character.
 * @param uniChar the character appended to the (initially empty) value
 * @param type the type of Token being represented
**/
Token::Token(PRUnichar uniChar, short type)
  : type(type)
{
  value.Append(uniChar);
} //-- Token

/**
 * Copy Constructor: duplicates both the value and the type of the
 * given Token.
 * @param token the Token to copy
**/
Token::Token(const Token& token)
  : value(token.value),
    type(token.type)
{
} //-- Token

/**
 * Destructor for Token; no explicit cleanup is performed here.
**/
Token::~Token()
{
} //-- ~Token


  //--------------------------------/
 //- Implementation of ExprLexer -/
//-------------------------------/

/*
 * Complex Tokens
 *
 * Names that parse() compares token text against in order to refine
 * the type of a token that was first scanned as a plain name.
*/
//-- Nodetype tokens: matched against the name token preceding a '('
const String ExprLexer::COMMENT(NS_LITERAL_STRING("comment"));
const String ExprLexer::NODE(NS_LITERAL_STRING("node"));
const String ExprLexer::PROC_INST(NS_LITERAL_STRING("processing-instruction"));
const String ExprLexer::TEXT(NS_LITERAL_STRING("text"));

//-- boolean operator names: matched only where an operator is expected
const String ExprLexer::AND(NS_LITERAL_STRING("and"));
const String ExprLexer::OR(NS_LITERAL_STRING("or"));

//-- multiplicative operator names: matched only where an operator is
//-- expected
const String ExprLexer::MODULUS(NS_LITERAL_STRING("mod"));
const String ExprLexer::DIVIDE(NS_LITERAL_STRING("div"));

/**
 * The set of Lexer error messages.
 *
 * NOTE(review): presumably indexed by errorCode, in which case the order
 * of the entries must match the ERROR_* constants declared in
 * ExprLexer.h -- confirm before reordering or inserting entries.
 **/
const String ExprLexer::error_message[] =
{
  String(NS_LITERAL_STRING("VariableReference expected")),
  String(NS_LITERAL_STRING("Operator expected")),
  String(NS_LITERAL_STRING("Literal is not closed")),
  String(NS_LITERAL_STRING(": not expected")),
  String(NS_LITERAL_STRING("! not expected, use != or not()")),
  String(NS_LITERAL_STRING("found an unknown character"))
};

  //---------------/
 //- Constructors -/
//---------------/

/**
 * Creates a new ExprLexer for the given expression. The pattern is
 * tokenized immediately and the read position is placed on the first
 * token that was produced (if any).
 * @param pattern the expression String to tokenize
**/
ExprLexer::ExprLexer(const String& pattern)
  : firstItem(0),
    lastItem(0),
    tokenCount(0),
    prevToken(0)
{
  //-- token handed out by nextToken()/peek() once the list is exhausted
  endToken.type = Token::END;

  parse(pattern);

  //-- start reading at the head of the freshly built token list
  currentItem = firstItem;
} //-- ExprLexer

/**
 * Destroys this ExprLexer, deleting every list item together with the
 * Token it holds.
**/
ExprLexer::~ExprLexer()
{
  //-- walk the list from the head, releasing each node and its token
  for (currentItem = firstItem; currentItem; ) {
    TokenListItem* doomed = currentItem;
    //-- remember the successor before the node goes away
    currentItem = doomed->next;
    delete doomed->token;
    delete doomed;
  }
} //-- ~ExprLexer


/**
 * Returns true while the read position refers to a token, i.e. while
 * nextToken() would return something other than the end token.
**/
MBool ExprLexer::hasMoreTokens()
{
  if (!currentItem)
    return MB_FALSE;
  return MB_TRUE;
} //-- hasMoreTokens

/**
 * Returns the token at the read position and advances to the next one.
 * Once the list is exhausted, a pointer to the lexer's own end token is
 * returned instead. The returned Token remains owned by the lexer.
**/
Token* ExprLexer::nextToken()
{
  //-- past the last token: hand out the end marker
  if (!currentItem)
    return &endToken;

  Token* result = currentItem->token;
  currentItem = currentItem->next;
  return result;
} //-- nextToken

/**
 * Moves the read position back by one token. When the position is
 * already past the end of the list, it is moved onto the last token.
**/
void ExprLexer::pushBack()
{
  currentItem = currentItem ? currentItem->previous : lastItem;
} //-- pushBack

/**
 * Returns the token at the read position without advancing, or a
 * pointer to the lexer's end token when the list is exhausted.
**/
Token* ExprLexer::peek()
{
  return currentItem ? currentItem->token : &endToken;
} //-- peek

/**
 * Appends the given Token to the end of the token list and records it
 * as the most recently added token (prevToken).
 * The list item stores the pointer as is; the destructor later deletes
 * every stored Token.
 * @param token the Token to append
**/
void ExprLexer::addToken(Token* token)
{
  TokenListItem* tlItem = new TokenListItem;
  tlItem->token = token;
  tlItem->next  = 0;
  //-- Always set the back link. For the very first item lastItem is 0,
  //-- so the head gets a null 'previous' rather than an indeterminate
  //-- pointer that pushBack() would otherwise follow.
  tlItem->previous = lastItem;
  if (lastItem)
    lastItem->next = tlItem;
  if (!firstItem)
    firstItem = tlItem;
  lastItem = tlItem;
  prevToken = token;
  ++tokenCount;
} //-- addToken

/**
 * Decides whether the token following the given one has to be an
 * operator. This is a helper for the first bullet of [XPath 3.7]
 * Lexical Structure.
 * @param token the preceding Token; may be null
 * @return MB_FALSE when there is no preceding token (null pointer or
 *         NULL_TOKEN) or when its type lies in the COMMA..UNION_OP
 *         range, MB_TRUE otherwise
**/
MBool ExprLexer::nextIsOperatorToken(Token* token)
{
  if (!token)
    return MB_FALSE;

  const short kind = token->type;
  if (kind == Token::NULL_TOKEN)
    return MB_FALSE;

  /* The range test relies on the tokens having the right order in
     ExprLexer.h */
  const bool outsideRange = (kind < Token::COMMA || kind > Token::UNION_OP);
  return outsideRange ? MB_TRUE : MB_FALSE;
} //-- nextIsOperatorToken

/**
 *  Parses the given String into the set of Tokens
**/
void ExprLexer::parse(const String& pattern)
{
  if (pattern.IsEmpty())
    return;

  String tokenBuffer;
  PRUint32 iter = 0, start;
  PRUint32 size = pattern.Length();
  short defType;
  PRUnichar ch;

  //-- initialize previous token, this will automatically get
  //-- deleted when it goes out of scope
  Token nullToken('\0', Token::NULL_TOKEN);

  prevToken = &nullToken;

  while (iter < size) {

    ch = pattern.CharAt(iter);
    defType = Token::CNAME;

    if (ch==DOLLAR_SIGN) {
      if (++iter == size || !XMLUtils::isLetter(ch=pattern.CharAt(iter))) {
        // Error, VariableReference expected
        errorPos = iter;
        errorCode = ERROR_UNRESOLVED_VAR_REFERENCE;
        if (firstItem)
          firstItem->token->type=Token::ERROR;
        else
          addToken(new Token('\0',Token::ERROR));
        iter=size; // bail
      }
      else
        defType = Token::VAR_REFERENCE;
    }
    // just reuse the QName parsing, which will use defType
    // the token to construct

    if (XMLUtils::isLetter(ch)) {
      // NCName, can get QName or OperatorName;
      //  FunctionName, NodeName, and AxisSpecifier may want whitespace,
      //  and are dealt with below
      start = iter;
      while (++iter < size &&
             XMLUtils::isNCNameChar(pattern.CharAt(iter))) /* just go */ ;
      if (iter < size && pattern.CharAt(iter)==COLON) {
        // try QName or wildcard, might need to step back for axis
        if (++iter < size)
          if (XMLUtils::isLetter(pattern.CharAt(iter)))
            while (++iter < size &&
                   XMLUtils::isNCNameChar(pattern.CharAt(iter))) /* just go */ ;
          else if (pattern.CharAt(iter)=='*'
                   && defType != Token::VAR_REFERENCE)
            ++iter; /* eat wildcard for NameTest, bail for var ref at COLON */
          else
            iter--; // step back
      }
      if (nextIsOperatorToken(prevToken)) {
        if (pattern.subString(start,iter,subStr).Equals(AND))
          defType = Token::AND_OP;
        else if (pattern.subString(start,iter,subStr).Equals(OR))
          defType = Token::OR_OP;
        else if (pattern.subString(start,iter,subStr).Equals(MODULUS))
          defType = Token::MODULUS_OP;
        else if (pattern.subString(start,iter,subStr).Equals(DIVIDE))
          defType = Token::DIVIDE_OP;
        else {
          // Error "operator expected"
          // XXX QUESTION: spec is not too precise
          // badops is sure an error, but is bad:ops, too? We say yes!
          errorPos = iter;
          errorCode = ERROR_OP_EXPECTED;
          if (firstItem)
            firstItem->token->type=Token::ERROR;
          else
            addToken(new Token('\0',Token::ERROR));
          iter=size; // bail
        }
      }
      addToken(new Token(pattern.subString(start,iter,subStr),defType));
    }
    else if (isXPathDigit(ch)) {
      start = iter;
      while (++iter < size &&
             isXPathDigit(pattern.CharAt(iter))) /* just go */;
      if (iter < size && pattern.CharAt(iter) == '.')
        while (++iter < size &&
               isXPathDigit(pattern.CharAt(iter))) /* just go */;
      addToken(new Token(pattern.subString(start,iter,subStr),Token::NUMBER));
    }
    else {
      switch (ch) {
        //-- ignore whitespace
      case SPACE:
      case TX_TAB:
      case TX_CR:
      case TX_LF:
        ++iter;
        break;
      case S_QUOTE :
      case D_QUOTE :
        start=iter;
        iter = pattern.indexOf(ch, (PRInt32)start + 1);
        if ((PRInt32)iter == kNotFound) {
          // XXX Error reporting "unclosed literal"
          errorPos = start;
          errorCode = ERROR_UNCLOSED_LITERAL;
          if (firstItem)
            firstItem->token->type=Token::ERROR;
          else
            addToken(new Token('\0',Token::ERROR));
          iter=size; // bail
        }
        else {
          addToken(new Token(pattern.subString(start+1,iter,subStr),
                             Token::LITERAL));
          ++iter;
        }
        break;
      case PERIOD:
        // period can be .., .(DIGITS)+ or ., check next
        if (++iter < size) {
          ch=pattern.CharAt(iter);
          if (isXPathDigit(ch)) {
            start=iter-1;
            while (++iter < size &&
                   isXPathDigit(pattern.CharAt(iter))) /* just go */;
            addToken(new Token(pattern.subString(start,iter,subStr),
                               Token::NUMBER));
          }
          else if (ch==PERIOD) {
            addToken(new Token(pattern.subString(iter-1,iter++,subStr),
                               Token::PARENT_NODE));
          }
          else
            addToken(new Token(PERIOD, Token::SELF_NODE));
        }
        else
          addToken(new Token(ch, Token::SELF_NODE));
        // iter++ is already in the number test

        break;
      case COLON: // QNames are dealt above, must be axis ident
        if (++iter < size && pattern.CharAt(iter)==COLON &&
            prevToken->type == Token::CNAME) {
          prevToken->type = Token::AXIS_IDENTIFIER;
          ++iter;
        }
        else {
          // XXX Error report "colon is neither QName nor axis"
          errorPos = iter;
          errorCode = ERROR_COLON;
          if (firstItem)
            firstItem->token->type=Token::ERROR;
          else
            addToken(new Token('\0',Token::ERROR));
          iter=size; // bail
        }
        break;
      case FORWARD_SLASH :
        if (++iter < size && pattern.CharAt(iter)==ch) {
          addToken(new Token(pattern.subString(iter-1,++iter,subStr),
                             Token::ANCESTOR_OP));
        }
        else {
          addToken(new Token(ch, Token::PARENT_OP));
        }
        break;
      case BANG : // can only be !=
        if (++iter < size && pattern.CharAt(iter)==EQUAL) {
          addToken(new Token(pattern.subString(iter-1,++iter,subStr),
                             Token::NOT_EQUAL_OP));
        }
        else {
          // Error ! is not not()
          errorPos = iter;
          errorCode = ERROR_BANG;
          if (firstItem)
            firstItem->token->type=Token::ERROR;
          else
            addToken(new Token('\0',Token::ERROR));
          iter=size; // bail
        }
        break;
      case EQUAL:
        addToken(new Token(ch,Token::EQUAL_OP));
        ++iter;
        break;
      case L_ANGLE:
        if (++iter < size && pattern.CharAt(iter)==EQUAL) {
          addToken(new Token(pattern.subString(iter-1,++iter,subStr),
                             Token::LESS_OR_EQUAL_OP));
        }
        else
          addToken(new Token(ch,Token::LESS_THAN_OP));
        break;
      case R_ANGLE:
        if (++iter < size && pattern.CharAt(iter)==EQUAL) {
          addToken(new Token(pattern.subString(iter-1,++iter,subStr),
                             Token::GREATER_OR_EQUAL_OP));
        }
        else
          addToken(new Token(ch,Token::GREATER_THAN_OP));
        break;
      case HYPHEN :
        addToken(new Token(ch,Token::SUBTRACTION_OP));
        ++iter;
        break;
      case ASTERIX:
        if (nextIsOperatorToken(prevToken))
          addToken(new Token(ch,Token::MULTIPLY_OP));
        else
          addToken(new Token(ch,Token::CNAME));
        ++iter;
        break;
      case L_PAREN:
        if (prevToken->type == Token::CNAME) {
          if (prevToken->value.Equals(COMMENT))
            prevToken->type = Token::COMMENT;
          else if (prevToken->value.Equals(NODE))
            prevToken->type = Token::NODE;
          else if (prevToken->value.Equals(PROC_INST))
            prevToken->type = Token::PROC_INST;
          else if (prevToken->value.Equals(TEXT))
            prevToken->type = Token::TEXT;
          else
            prevToken->type = Token::FUNCTION_NAME;
        }
        ++iter;
        addToken(new Token(ch,Token::L_PAREN));
        break;
      case R_PAREN:
        ++iter;
        addToken(new Token(ch,Token::R_PAREN));
        break;
      case L_BRACKET:
        ++iter;
        addToken(new Token(ch,Token::L_BRACKET));
        break;
      case R_BRACKET:
        ++iter;
        addToken(new Token(ch,Token::R_BRACKET));
        break;
      case COMMA:
        ++iter;
        addToken(new Token(ch,Token::COMMA));
        break;
      case AT_SIGN :
        ++iter;
        addToken(new Token(ch,Token::AT_SIGN));
        break;
      case PLUS:
        ++iter;
        addToken(new Token(ch,Token::ADDITION_OP));
        break;
      case VERT_BAR:
        ++iter;
        addToken(new Token(ch,Token::UNION_OP));
        break;
      default:
        // Error, don't grok character :-(
        errorPos = iter;
        errorCode = ERROR_UNKNOWN_CHAR;
        if (firstItem)
          firstItem->token->type=Token::ERROR;
        else
          addToken(new Token('\0',Token::ERROR));
        iter=size; // bail
      }
    }
  }
} //-- parse