зеркало из https://github.com/mozilla/gecko-dev.git
525 строки
19 KiB
C++
525 строки
19 KiB
C++
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
#ifndef Tokenizer_h__
|
|
#define Tokenizer_h__
|
|
|
|
#include <type_traits>
|
|
|
|
#include "nsString.h"
|
|
#include "mozilla/CheckedInt.h"
|
|
#include "mozilla/ScopeExit.h"
|
|
#include "mozilla/UniquePtr.h"
|
|
#include "nsTArray.h"
|
|
|
|
namespace mozilla {
|
|
|
|
template <typename TChar>
|
|
class TokenizerBase {
|
|
public:
|
|
typedef nsTSubstring<TChar> TAString;
|
|
typedef nsTString<TChar> TString;
|
|
typedef nsTDependentString<TChar> TDependentString;
|
|
typedef nsTDependentSubstring<TChar> TDependentSubstring;
|
|
|
|
static TChar const sWhitespaces[];
|
|
|
|
/**
|
|
* The analyzer works with elements in the input cut to a sequence of token
|
|
* where each token has an elementary type
|
|
*/
|
|
enum TokenType : uint32_t {
|
|
TOKEN_UNKNOWN,
|
|
TOKEN_RAW,
|
|
TOKEN_ERROR,
|
|
TOKEN_INTEGER,
|
|
TOKEN_WORD,
|
|
TOKEN_CHAR,
|
|
TOKEN_WS,
|
|
TOKEN_EOL,
|
|
TOKEN_EOF,
|
|
TOKEN_CUSTOM0 = 1000
|
|
};
|
|
|
|
enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE };
|
|
|
|
/**
|
|
* Class holding the type and the value of a token. It can be manually
|
|
* created to allow checks against it via methods of TTokenizer or are results
|
|
* of some of the TTokenizer's methods.
|
|
*/
|
|
class Token {
|
|
TokenType mType;
|
|
TDependentSubstring mWord;
|
|
TString mCustom;
|
|
TChar mChar;
|
|
uint64_t mInteger;
|
|
ECaseSensitivity mCustomCaseInsensitivity;
|
|
bool mCustomEnabled;
|
|
|
|
// If this token is a result of the parsing process, this member is
|
|
// referencing a sub-string in the input buffer. If this is externally
|
|
// created Token this member is left an empty string.
|
|
TDependentSubstring mFragment;
|
|
|
|
friend class TokenizerBase<TChar>;
|
|
void AssignFragment(typename TAString::const_char_iterator begin,
|
|
typename TAString::const_char_iterator end);
|
|
|
|
static Token Raw();
|
|
|
|
public:
|
|
Token();
|
|
Token(const Token& aOther);
|
|
Token& operator=(const Token& aOther);
|
|
|
|
// Static constructors of tokens by type and value
|
|
static Token Word(TAString const& aWord);
|
|
static Token Char(TChar const aChar);
|
|
static Token Number(uint64_t const aNumber);
|
|
static Token Whitespace();
|
|
static Token NewLine();
|
|
static Token EndOfFile();
|
|
static Token Error();
|
|
|
|
// Compares the two tokens, type must be identical and value
|
|
// of one of the tokens must be 'any' or equal.
|
|
bool Equals(const Token& aOther) const;
|
|
|
|
TokenType Type() const { return mType; }
|
|
TChar AsChar() const;
|
|
TDependentSubstring AsString() const;
|
|
uint64_t AsInteger() const;
|
|
|
|
TDependentSubstring Fragment() const { return mFragment; }
|
|
};
|
|
|
|
/**
|
|
* Consumers may register a custom string that, when found in the input, is
|
|
* considered a token and returned by Next*() and accepted by Check*()
|
|
* methods. AddCustomToken() returns a reference to a token that can then be
|
|
* comapred using Token::Equals() againts the output from Next*() or be passed
|
|
* to Check*().
|
|
*/
|
|
Token AddCustomToken(const TAString& aValue,
|
|
ECaseSensitivity aCaseInsensitivity,
|
|
bool aEnabled = true);
|
|
template <uint32_t N>
|
|
Token AddCustomToken(const TChar (&aValue)[N],
|
|
ECaseSensitivity aCaseInsensitivity,
|
|
bool aEnabled = true) {
|
|
return AddCustomToken(TDependentSubstring(aValue, N - 1),
|
|
aCaseInsensitivity, aEnabled);
|
|
}
|
|
void RemoveCustomToken(Token& aToken);
|
|
/**
|
|
* Only applies to a custom type of a Token (see AddCustomToken above.)
|
|
* This turns on and off token recognition. When a custom token is disabled,
|
|
* it's ignored as never added as a custom token.
|
|
*/
|
|
void EnableCustomToken(Token const& aToken, bool aEnable);
|
|
|
|
/**
|
|
* Mode of tokenization.
|
|
* FULL tokenization, the default, recognizes built-in tokens and any custom
|
|
* tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest
|
|
* is seen as 'raw'. This mode can be understood as a 'binary' mode.
|
|
*/
|
|
enum class Mode { FULL, CUSTOM_ONLY };
|
|
void SetTokenizingMode(Mode aMode);
|
|
|
|
/**
|
|
* Return false iff the last Check*() call has returned false or when we've
|
|
* read past the end of the input string.
|
|
*/
|
|
[[nodiscard]] bool HasFailed() const;
|
|
|
|
protected:
|
|
explicit TokenizerBase(const TChar* aWhitespaces = nullptr,
|
|
const TChar* aAdditionalWordChars = nullptr);
|
|
|
|
// false if we have already read the EOF token.
|
|
bool HasInput() const;
|
|
// Main parsing function, it doesn't shift the read cursor, just returns the
|
|
// next token position.
|
|
typename TAString::const_char_iterator Parse(Token& aToken) const;
|
|
// Is read cursor at the end?
|
|
bool IsEnd(const typename TAString::const_char_iterator& caret) const;
|
|
// True, when we are at the end of the input data, but it has not been marked
|
|
// as complete yet. In that case we cannot proceed with providing a
|
|
// multi-TChar token.
|
|
bool IsPending(const typename TAString::const_char_iterator& caret) const;
|
|
// Is read cursor on a character that is a word start?
|
|
bool IsWordFirst(const TChar aInput) const;
|
|
// Is read cursor on a character that is an in-word letter?
|
|
bool IsWord(const TChar aInput) const;
|
|
// Is read cursor on a character that is a valid number?
|
|
// TODO - support multiple radix
|
|
bool IsNumber(const TChar aInput) const;
|
|
// Is equal to the given custom token?
|
|
bool IsCustom(const typename TAString::const_char_iterator& caret,
|
|
const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
|
|
|
|
// Friendly helper to assign a fragment on a Token
|
|
static void AssignFragment(Token& aToken,
|
|
typename TAString::const_char_iterator begin,
|
|
typename TAString::const_char_iterator end);
|
|
|
|
#ifdef DEBUG
|
|
// This is called from inside Tokenizer methods to make sure the token is
|
|
// valid.
|
|
void Validate(Token const& aToken);
|
|
#endif
|
|
|
|
// true iff we have already read the EOF token
|
|
bool mPastEof;
|
|
// true iff the last Check*() call has returned false, reverts to true on
|
|
// Rollback() call
|
|
bool mHasFailed;
|
|
// true if the input string is final (finished), false when we expect more
|
|
// data yet to be fed to the tokenizer (see IncrementalTokenizer derived
|
|
// class).
|
|
bool mInputFinished;
|
|
// custom only vs full tokenizing mode, see the Parse() method
|
|
Mode mMode;
|
|
// minimal raw data chunked delivery during incremental feed
|
|
uint32_t mMinRawDelivery;
|
|
|
|
// Customizable list of whitespaces
|
|
const TChar* mWhitespaces;
|
|
// Additinal custom word characters
|
|
const TChar* mAdditionalWordChars;
|
|
|
|
// All these point to the original buffer passed to the constructor or to the
|
|
// incremental buffer after FeedInput.
|
|
typename TAString::const_char_iterator
|
|
mCursor; // Position of the current (actually next to read) token start
|
|
typename TAString::const_char_iterator mEnd; // End of the input position
|
|
|
|
// This is the list of tokens user has registered with AddCustomToken()
|
|
nsTArray<UniquePtr<Token>> mCustomTokens;
|
|
uint32_t mNextCustomTokenID;
|
|
|
|
private:
|
|
TokenizerBase() = delete;
|
|
TokenizerBase(const TokenizerBase&) = delete;
|
|
TokenizerBase(TokenizerBase&&) = delete;
|
|
TokenizerBase(const TokenizerBase&&) = delete;
|
|
TokenizerBase& operator=(const TokenizerBase&) = delete;
|
|
};
|
|
|
|
/**
|
|
* This is a simple implementation of a lexical analyzer or maybe better
|
|
* called a tokenizer.
|
|
*
|
|
* Please use Tokenizer or Tokenizer16 classes, that are specializations
|
|
* of this template class. Tokenizer is for ASCII input, Tokenizer16 may
|
|
* handle char16_t input, but doesn't recognize whitespaces or numbers
|
|
* other than standard `char` specialized Tokenizer class.
|
|
*/
|
|
template <typename TChar>
|
|
class TTokenizer : public TokenizerBase<TChar> {
|
|
public:
|
|
typedef TokenizerBase<TChar> base;
|
|
|
|
/**
|
|
* @param aSource
|
|
* The string to parse.
|
|
* IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer
|
|
* lifetime. It's up to the consumer to make sure the string's buffer outlives
|
|
* the TTokenizer!
|
|
* @param aWhitespaces
|
|
* If non-null TTokenizer will use this custom set of whitespaces for
|
|
* CheckWhite() and SkipWhites() calls. By default the list consists of space
|
|
* and tab.
|
|
* @param aAdditionalWordChars
|
|
* If non-null it will be added to the list of characters that consist a
|
|
* word. This is useful when you want to accept e.g. '-' in HTTP headers. By
|
|
* default a word character is consider any character for which upper case
|
|
* is different from lower case.
|
|
*
|
|
* If there is an overlap between aWhitespaces and aAdditionalWordChars, the
|
|
* check for word characters is made first.
|
|
*/
|
|
explicit TTokenizer(const typename base::TAString& aSource,
|
|
const TChar* aWhitespaces = nullptr,
|
|
const TChar* aAdditionalWordChars = nullptr);
|
|
explicit TTokenizer(const TChar* aSource, const TChar* aWhitespaces = nullptr,
|
|
const TChar* aAdditionalWordChars = nullptr);
|
|
|
|
/**
|
|
* When there is still anything to read from the input, tokenize it, store the
|
|
* token type and value to aToken result and shift the cursor past this just
|
|
* parsed token. Each call to Next() reads another token from the input and
|
|
* shifts the cursor. Returns false if we have passed the end of the input.
|
|
*/
|
|
[[nodiscard]] bool Next(typename base::Token& aToken);
|
|
|
|
/**
|
|
* Parse the token on the input read cursor position, check its type is equal
|
|
* to aTokenType and if so, put it into aResult, shift the cursor and return
|
|
* true. Otherwise, leave the input read cursor position intact and return
|
|
* false.
|
|
*/
|
|
[[nodiscard]] bool Check(const typename base::TokenType aTokenType,
|
|
typename base::Token& aResult);
|
|
/**
|
|
* Same as above method, just compares both token type and token value passed
|
|
* in aToken. When both the type and the value equals, shift the cursor and
|
|
* return true. Otherwise return false.
|
|
*/
|
|
[[nodiscard]] bool Check(const typename base::Token& aToken);
|
|
|
|
/**
|
|
* SkipWhites method (below) may also skip new line characters automatically.
|
|
*/
|
|
enum WhiteSkipping {
|
|
/**
|
|
* SkipWhites will only skip what is defined as a white space (default).
|
|
*/
|
|
DONT_INCLUDE_NEW_LINE = 0,
|
|
/**
|
|
* SkipWhites will skip definited white spaces as well as new lines
|
|
* automatically.
|
|
*/
|
|
INCLUDE_NEW_LINE = 1
|
|
};
|
|
|
|
/**
|
|
* Skips any occurence of whitespaces specified in mWhitespaces member,
|
|
* optionally skip also new lines.
|
|
*/
|
|
void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
|
|
|
|
/**
|
|
* Skips all tokens until the given one is found or EOF is hit. The token
|
|
* or EOF are next to read.
|
|
*/
|
|
void SkipUntil(typename base::Token const& aToken);
|
|
|
|
// These are mostly shortcuts for the Check() methods above.
|
|
|
|
/**
|
|
* Check whitespace character is present.
|
|
*/
|
|
[[nodiscard]] bool CheckWhite() { return Check(base::Token::Whitespace()); }
|
|
/**
|
|
* Check there is a single character on the read cursor position. If so,
|
|
* shift the read cursor position and return true. Otherwise false.
|
|
*/
|
|
[[nodiscard]] bool CheckChar(const TChar aChar) {
|
|
return Check(base::Token::Char(aChar));
|
|
}
|
|
/**
|
|
* This is a customizable version of CheckChar. aClassifier is a function
|
|
* called with value of the character on the current input read position. If
|
|
* this user function returns true, read cursor is shifted and true returned.
|
|
* Otherwise false. The user classifiction function is not called when we are
|
|
* at or past the end and false is immediately returned.
|
|
*/
|
|
[[nodiscard]] bool CheckChar(bool (*aClassifier)(const TChar aChar));
|
|
/**
|
|
* Check for a whole expected word.
|
|
*/
|
|
[[nodiscard]] bool CheckWord(const typename base::TAString& aWord) {
|
|
return Check(base::Token::Word(aWord));
|
|
}
|
|
/**
|
|
* Shortcut for literal const word check with compile time length calculation.
|
|
*/
|
|
template <uint32_t N>
|
|
[[nodiscard]] bool CheckWord(const TChar (&aWord)[N]) {
|
|
return Check(
|
|
base::Token::Word(typename base::TDependentString(aWord, N - 1)));
|
|
}
|
|
/**
|
|
* Helper to check for a string compound of multiple tokens like "foo bar".
|
|
* The match is binary-exact, a white space or a delimiter character in the
|
|
* phrase must match exactly the characters in the input.
|
|
*/
|
|
[[nodiscard]] bool CheckPhrase(const typename base::TAString& aPhrase);
|
|
template <uint32_t N>
|
|
[[nodiscard]] bool CheckPhrase(const TChar (&aPhrase)[N]) {
|
|
return CheckPhrase(typename base::TDependentString(aPhrase, N - 1));
|
|
}
|
|
/**
|
|
* Checks \r, \n or \r\n.
|
|
*/
|
|
[[nodiscard]] bool CheckEOL() { return Check(base::Token::NewLine()); }
|
|
/**
|
|
* Checks we are at the end of the input string reading. If so, shift past
|
|
* the end and returns true. Otherwise does nothing and returns false.
|
|
*/
|
|
[[nodiscard]] bool CheckEOF() { return Check(base::Token::EndOfFile()); }
|
|
|
|
/**
|
|
* These are shortcuts to obtain the value immediately when the token type
|
|
* matches.
|
|
*/
|
|
[[nodiscard]] bool ReadChar(TChar* aValue);
|
|
[[nodiscard]] bool ReadChar(bool (*aClassifier)(const TChar aChar),
|
|
TChar* aValue);
|
|
[[nodiscard]] bool ReadWord(typename base::TAString& aValue);
|
|
[[nodiscard]] bool ReadWord(typename base::TDependentSubstring& aValue);
|
|
|
|
/**
|
|
* This is an integer read helper. It returns false and doesn't move the read
|
|
* cursor when any of the following happens:
|
|
* - the token at the read cursor is not an integer
|
|
* - the final number doesn't fit the T type
|
|
* Otherwise true is returned, aValue is filled with the integral number
|
|
* and the cursor is moved forward.
|
|
*/
|
|
template <typename T>
|
|
[[nodiscard]] bool ReadInteger(T* aValue) {
|
|
MOZ_RELEASE_ASSERT(aValue);
|
|
|
|
typename base::TAString::const_char_iterator rollback = mRollback;
|
|
typename base::TAString::const_char_iterator cursor = base::mCursor;
|
|
typename base::Token t;
|
|
if (!Check(base::TOKEN_INTEGER, t)) {
|
|
return false;
|
|
}
|
|
|
|
mozilla::CheckedInt<T> checked(t.AsInteger());
|
|
if (!checked.isValid()) {
|
|
// Move to a state as if Check() call has failed
|
|
mRollback = rollback;
|
|
base::mCursor = cursor;
|
|
base::mHasFailed = true;
|
|
return false;
|
|
}
|
|
|
|
*aValue = checked.value();
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Same as above, but accepts an integer with an optional minus sign.
|
|
*/
|
|
template <typename T, typename V = std::enable_if_t<
|
|
std::is_signed_v<std::remove_pointer_t<T>>,
|
|
std::remove_pointer_t<T>>>
|
|
[[nodiscard]] bool ReadSignedInteger(T* aValue) {
|
|
MOZ_RELEASE_ASSERT(aValue);
|
|
|
|
typename base::TAString::const_char_iterator rollback = mRollback;
|
|
typename base::TAString::const_char_iterator cursor = base::mCursor;
|
|
auto revert = MakeScopeExit([&] {
|
|
// Move to a state as if Check() call has failed
|
|
mRollback = rollback;
|
|
base::mCursor = cursor;
|
|
base::mHasFailed = true;
|
|
});
|
|
|
|
// Using functional raw access because '-' could be part of the word set
|
|
// making CheckChar('-') not work.
|
|
bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; });
|
|
|
|
typename base::Token t;
|
|
if (!Check(base::TOKEN_INTEGER, t)) {
|
|
return false;
|
|
}
|
|
|
|
mozilla::CheckedInt<T> checked(t.AsInteger());
|
|
if (minus) {
|
|
checked *= -1;
|
|
}
|
|
|
|
if (!checked.isValid()) {
|
|
return false;
|
|
}
|
|
|
|
*aValue = checked.value();
|
|
revert.release();
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Returns the read cursor position back as it was before the last call of any
|
|
* parsing method of TTokenizer (Next, Check*, Skip*, Read*) so that the last
|
|
* operation can be repeated. Rollback cannot be used multiple times, it only
|
|
* reverts the last successfull parse operation. It also cannot be used
|
|
* before any parsing operation has been called on the TTokenizer.
|
|
*/
|
|
void Rollback();
|
|
|
|
/**
|
|
* Record() and Claim() are collecting the input as it is being parsed to
|
|
* obtain a substring between particular syntax bounderies defined by any
|
|
* recursive descent parser or simple parser the TTokenizer is used to read
|
|
* the input for. Inlucsion of a token that has just been parsed can be
|
|
* controlled using an arguemnt.
|
|
*/
|
|
enum ClaimInclusion {
|
|
/**
|
|
* Include resulting (or passed) token of the last lexical analyzer
|
|
* operation in the result.
|
|
*/
|
|
INCLUDE_LAST,
|
|
/**
|
|
* Do not include it.
|
|
*/
|
|
EXCLUDE_LAST
|
|
};
|
|
|
|
/**
|
|
* Start the process of recording. Based on aInclude value the begining of
|
|
* the recorded sub-string is at the current position (EXCLUDE_LAST) or at the
|
|
* position before the last parsed token (INCLUDE_LAST).
|
|
*/
|
|
void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
|
|
/**
|
|
* Claim result of the record started with Record() call before. Depending on
|
|
* aInclude the ending of the sub-string result includes or excludes the last
|
|
* parsed or checked token.
|
|
*/
|
|
void Claim(typename base::TAString& aResult,
|
|
ClaimInclusion aInclude = EXCLUDE_LAST);
|
|
void Claim(typename base::TDependentSubstring& aResult,
|
|
ClaimInclusion aInclude = EXCLUDE_LAST);
|
|
|
|
/**
|
|
* If aToken is found, aResult is set to the substring between the current
|
|
* position and the position of aToken, potentially including aToken depending
|
|
* on aInclude.
|
|
* If aToken isn't found aResult is set to the substring between the current
|
|
* position and the end of the string.
|
|
* If aToken is found, the method returns true. Otherwise it returns false.
|
|
*
|
|
* Calling Rollback() after ReadUntil() will return the read cursor to the
|
|
* position it had before ReadUntil was called.
|
|
*/
|
|
[[nodiscard]] bool ReadUntil(typename base::Token const& aToken,
|
|
typename base::TDependentSubstring& aResult,
|
|
ClaimInclusion aInclude = EXCLUDE_LAST);
|
|
[[nodiscard]] bool ReadUntil(typename base::Token const& aToken,
|
|
typename base::TAString& aResult,
|
|
ClaimInclusion aInclude = EXCLUDE_LAST);
|
|
|
|
protected:
|
|
// All these point to the original buffer passed to the TTokenizer's
|
|
// constructor
|
|
typename base::TAString::const_char_iterator
|
|
mRecord; // Position where the recorded sub-string for Claim() is
|
|
typename base::TAString::const_char_iterator
|
|
mRollback; // Position of the previous token start
|
|
|
|
private:
|
|
TTokenizer() = delete;
|
|
TTokenizer(const TTokenizer&) = delete;
|
|
TTokenizer(TTokenizer&&) = delete;
|
|
TTokenizer(const TTokenizer&&) = delete;
|
|
TTokenizer& operator=(const TTokenizer&) = delete;
|
|
};
|
|
|
|
typedef TTokenizer<char> Tokenizer;
|
|
typedef TTokenizer<char16_t> Tokenizer16;
|
|
|
|
} // namespace mozilla
|
|
|
|
#endif // Tokenizer_h__
|