Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1322825 - Incremental tokenizer. r=froydnj

This commit is contained in:
Parent: 5019a0d0df
Commit: 3a39b6dbee
IncrementalTokenizer.cpp
@@ -0,0 +1,195 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/IncrementalTokenizer.h"

#include "mozilla/AutoRestore.h"

#include "nsIInputStream.h"
#include "IncrementalTokenizer.h"
#include <algorithm>

namespace mozilla {

IncrementalTokenizer::IncrementalTokenizer(Consumer aConsumer,
                                           const char * aWhitespaces,
                                           const char * aAdditionalWordChars,
                                           uint32_t aRawMinBuffered)
  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
#ifdef DEBUG
  , mConsuming(false)
#endif
  , mNeedMoreInput(false)
  , mRollback(false)
  , mInputCursor(0)
  , mConsumer(aConsumer)
{
  mInputFinished = false;
  mMinRawDelivery = aRawMinBuffered;
}

nsresult IncrementalTokenizer::FeedInput(const nsACString & aInput)
{
  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
  MOZ_ASSERT(!mInputFinished);

  mInput.Cut(0, mInputCursor);
  mInputCursor = 0;

  mInput.Append(aInput);

  return Process();
}

nsresult IncrementalTokenizer::FeedInput(nsIInputStream * aInput, uint32_t aCount)
{
  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
  MOZ_ASSERT(!mInputFinished);
  MOZ_ASSERT(!mConsuming);

  mInput.Cut(0, mInputCursor);
  mInputCursor = 0;

  nsresult rv = NS_OK;
  while (NS_SUCCEEDED(rv) && aCount) {
    nsCString::index_type remainder = mInput.Length();
    nsCString::index_type load =
      std::min<nsCString::index_type>(aCount, PR_UINT32_MAX - remainder);

    if (!load) {
      // To keep the API simple, we fail if the input data buffer is filled.
      // It's highly unlikely there will ever be such an amount of data
      // accumulated unless there is a logic fault in the consumer code.
      NS_ERROR("IncrementalTokenizer consumer not reading data?");
      return NS_ERROR_OUT_OF_MEMORY;
    }

    if (!mInput.SetLength(remainder + load, fallible)) {
      return NS_ERROR_OUT_OF_MEMORY;
    }

    nsCString::char_iterator buffer = mInput.BeginWriting() + remainder;

    uint32_t read;
    rv = aInput->Read(buffer, load, &read);
    if (NS_SUCCEEDED(rv)) {
      // remainder + load fits the uint32_t size, so must remainder + read.
      mInput.SetLength(remainder + read);
      aCount -= read;

      rv = Process();
    }
  }

  return rv;
}

nsresult IncrementalTokenizer::FinishInput()
{
  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
  MOZ_ASSERT(!mInputFinished);
  MOZ_ASSERT(!mConsuming);

  mInput.Cut(0, mInputCursor);
  mInputCursor = 0;

  mInputFinished = true;
  nsresult rv = Process();
  mConsumer = nullptr;
  return rv;
}

bool IncrementalTokenizer::Next(Token & aToken)
{
  // Assert we are called only from the consumer callback
  MOZ_ASSERT(mConsuming);

  if (mPastEof) {
    return false;
  }

  nsACString::const_char_iterator next = Parse(aToken);
  mPastEof = aToken.Type() == TOKEN_EOF;
  if (next == mCursor && !mPastEof) {
    // Not enough input to make a deterministic decision.
    return false;
  }

  AssignFragment(aToken, mCursor, next);
  mCursor = next;
  return true;
}

void IncrementalTokenizer::NeedMoreInput()
{
  // Assert we are called only from the consumer callback
  MOZ_ASSERT(mConsuming);

  // When the input has been finished, we can't set the flag, to prevent
  // an indefinite wait for more input (that will never come)
  mNeedMoreInput = !mInputFinished;
}

void IncrementalTokenizer::Rollback()
{
  // Assert we are called only from the consumer callback
  MOZ_ASSERT(mConsuming);

  mRollback = true;
}

nsresult IncrementalTokenizer::Process()
{
#ifdef DEBUG
  // Assert we are not re-entered
  MOZ_ASSERT(!mConsuming);

  AutoRestore<bool> consuming(mConsuming);
  mConsuming = true;
#endif

  MOZ_ASSERT(!mPastEof);

  nsresult rv = NS_OK;

  mInput.BeginReading(mCursor);
  mCursor += mInputCursor;
  mInput.EndReading(mEnd);

  while (NS_SUCCEEDED(rv) && !mPastEof) {
    Token token;
    nsACString::const_char_iterator next = Parse(token);
    mPastEof = token.Type() == TOKEN_EOF;
    if (next == mCursor && !mPastEof) {
      // Not enough input to make a deterministic decision.
      break;
    }

    AssignFragment(token, mCursor, next);

    nsACString::const_char_iterator rollback = mCursor;
    mCursor = next;

    mNeedMoreInput = mRollback = false;

    rv = mConsumer(token, *this);
    if (NS_FAILED(rv)) {
      break;
    }
    if (mNeedMoreInput || mRollback) {
      mCursor = rollback;
      mPastEof = false;
      if (mNeedMoreInput) {
        break;
      }
    }
  }

  mInputCursor = mCursor - mInput.BeginReading();
  return rv;
}

} // mozilla
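Taken together, FeedInput() and FinishInput() above drive Process(), which calls the consumer once per recognized token. A minimal usage sketch, assuming only the API added in this patch (the function and stream names are hypothetical):

#include "mozilla/IncrementalTokenizer.h"

using namespace mozilla;

// Hypothetical driver: feeds a stream into the tokenizer and lets the
// consumer lambda see every token as soon as it can be recognized.
nsresult ParseFromStream(nsIInputStream* aStream, uint32_t aAvailable)
{
  IncrementalTokenizer tokenizer(
    [](TokenizerBase::Token const& aToken,
       IncrementalTokenizer& aTokenizer) -> nsresult {
      if (aToken.Type() == TokenizerBase::TOKEN_ERROR) {
        return NS_ERROR_UNEXPECTED; // bubbles out of Feed/FinishInput
      }
      // ... handle aToken.Fragment() here ...
      return NS_OK;
    });

  // Process() runs inside this call and invokes the lambda for every
  // token that can already be deterministically recognized.
  nsresult rv = tokenizer.FeedInput(aStream, aAvailable);
  NS_ENSURE_SUCCESS(rv, rv);

  // Flushes any remaining buffered data and delivers the EOF token.
  return tokenizer.FinishInput();
}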
IncrementalTokenizer.h
@@ -0,0 +1,122 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef INCREMENTAL_TOKENIZER_H__
#define INCREMENTAL_TOKENIZER_H__

#include "mozilla/Tokenizer.h"

#include "nsError.h"
#include <functional>

class nsIInputStream;

namespace mozilla {

class IncrementalTokenizer : public TokenizerBase
{
public:
  /**
   * The consumer callback. The function is called for every single token
   * found in the input. A failure result returned by this callback stops
   * the tokenization immediately and bubbles up to the result of
   * Feed/FinishInput.
   *
   * Fragment()s of consumed tokens are guaranteed to remain valid until the
   * next call to Feed/FinishInput and point into a single linear buffer.
   * Hence, they can safely be used to accumulate data for processing after
   * Feed/FinishInput has returned.
   */
  typedef std::function<nsresult(Token const&, IncrementalTokenizer& i)> Consumer;

  /**
   * For the aWhitespaces and aAdditionalWordChars arguments see TokenizerBase.
   *
   * @param aConsumer
   *    A mandatory non-null argument, a function that consumes the tokens as
   *    they come when the tokenizer is fed.
   * @param aRawMinBuffered
   *    When we have buffered at least aRawMinBuffered data, but no custom
   *    token has been found so far because the incremental feed chunks are
   *    too small, deliver the raw data to preserve streaming and to save
   *    memory. This only has an effect in the custom-only tokenizing mode
   *    (Mode::CUSTOM_ONLY).
   */
  explicit IncrementalTokenizer(Consumer aConsumer,
                                const char* aWhitespaces = nullptr,
                                const char* aAdditionalWordChars = nullptr,
                                uint32_t aRawMinBuffered = 1024);

  /**
   * Pushes input to be tokenized. These directly call the Consumer callback
   * on every token found. The result of the Consumer callback is returned
   * here.
   *
   * The tokenizer must be initialized with a valid consumer prior to calling
   * these methods. It's not allowed to call Feed/FinishInput from inside the
   * Consumer callback.
   */
  nsresult FeedInput(const nsACString& aInput);
  nsresult FeedInput(nsIInputStream* aInput, uint32_t aCount);
  nsresult FinishInput();

  /**
   * Can only be called from inside the consumer callback.
   *
   * When there is still anything to read from the input, tokenize it, store
   * the token type and value in the aToken result and shift the cursor past
   * the just-parsed token. Each call to Next() reads another token from
   * the input and shifts the cursor.
   *
   * Returns false if there is not enough data to deterministically recognize
   * a token or when the last returned token was EOF.
   */
  MOZ_MUST_USE
  bool Next(Token& aToken);

  /**
   * Can only be called from inside the consumer callback.
   *
   * Tells the tokenizer to revert the cursor and stop the async parsing until
   * the next feed of the input. This is useful when more than one token is
   * needed to decide on the syntax but there is not enough input to get the
   * next token (Next() returned false.)
   */
  void NeedMoreInput();

  /**
   * Can only be called from inside the consumer callback.
   *
   * This makes the consumer callback be called again with the input parsed
   * again from the previous cursor position. This is useful when the
   * tokenizer state (custom tokens, tokenization mode) has changed and
   * we want to re-parse the input.
   */
  void Rollback();

private:
  // Loops over the input with TokenizerBase::Parse and calls the Consumer
  // callback.
  nsresult Process();

#ifdef DEBUG
  // True when inside the consumer callback, used only for assertions.
  bool mConsuming;
#endif // DEBUG
  // Modifiable only from the Consumer callback, tells the parser to break,
  // roll back and wait for more input.
  bool mNeedMoreInput;
  // Modifiable only from the Consumer callback, tells the parser to roll back
  // and parse the input again, with (if modified) new settings of the
  // tokenizer.
  bool mRollback;
  // The input buffer. Updated with each call to Feed/FinishInput.
  nsCString mInput;
  // Numerical index pointing at the current cursor position. We don't keep a
  // direct reference into the string buffer since the buffer often gets
  // reallocated.
  nsCString::index_type mInputCursor;
  // Reference to the consumer function.
  Consumer mConsumer;
};

} // mozilla

#endif
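The Next()/NeedMoreInput() contract documented above is easiest to see in a consumer that needs two tokens before it can decide. A minimal sketch, assuming a hypothetical "key=value" grammar (everything except the IncrementalTokenizer API is made up):

#include "mozilla/IncrementalTokenizer.h"

using namespace mozilla;

// Hypothetical "key=value" grammar: when the key word has arrived but the
// following character has not, the consumer asks for more input and will
// be called again with the same token after the next FeedInput().
IncrementalTokenizer tokenizer(
  [](TokenizerBase::Token const& aToken,
     IncrementalTokenizer& aTokenizer) -> nsresult {
    if (aToken.Type() != TokenizerBase::TOKEN_WORD) {
      return NS_OK; // separators are uninteresting in this sketch
    }

    TokenizerBase::Token next;
    if (!aTokenizer.Next(next)) {
      // Not enough buffered data to read the token following the key;
      // roll back to aToken and suspend until more input is fed.
      aTokenizer.NeedMoreInput();
      return NS_OK;
    }

    bool isAssignment = next.Equals(TokenizerBase::Token::Char('='));
    // ... handle aToken.AsString() and isAssignment ...
    return NS_OK;
  });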
Tokenizer.cpp
@@ -7,6 +7,7 @@
#include "Tokenizer.h"

#include "nsUnicharUtils.h"
#include <algorithm>

namespace mozilla {

@@ -15,11 +16,9 @@ static const char sWhitespaces[] = " \t";
Tokenizer::Tokenizer(const nsACString& aSource,
                     const char* aWhitespaces,
                     const char* aAdditionalWordChars)
  : mPastEof(false)
  , mHasFailed(false)
  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
  , mAdditionalWordChars(aAdditionalWordChars)
  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
{
  mInputFinished = true;
  aSource.BeginReading(mCursor);
  mRecord = mRollback = mCursor;
  aSource.EndReading(mEnd);

@@ -43,7 +42,7 @@ Tokenizer::Next(Token& aToken)
  mRollback = mCursor;
  mCursor = Parse(aToken);

  aToken.AssignFragment(mRollback, mCursor);
  AssignFragment(aToken, mRollback, mCursor);

  mPastEof = aToken.Type() == TOKEN_EOF;
  mHasFailed = false;

@@ -67,7 +66,7 @@ Tokenizer::Check(const TokenType aTokenType, Token& aResult)
  mRollback = mCursor;
  mCursor = next;

  aResult.AssignFragment(mRollback, mCursor);
  AssignFragment(aResult, mRollback, mCursor);

  mPastEof = aResult.Type() == TOKEN_EOF;
  mHasFailed = false;

@@ -96,12 +95,6 @@ Tokenizer::Check(const Token& aToken)
  return true;
}

bool
Tokenizer::HasFailed() const
{
  return mHasFailed;
}

void
Tokenizer::SkipWhites(WhiteSkipping aIncludeNewLines)
{

@@ -275,24 +268,156 @@ Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
  aResult.Rebind(mRecord, close - mRecord);
}

// protected
// TokenizerBase

TokenizerBase::TokenizerBase(const char* aWhitespaces,
                             const char* aAdditionalWordChars)
  : mPastEof(false)
  , mHasFailed(false)
  , mInputFinished(true)
  , mMode(Mode::FULL)
  , mMinRawDelivery(1024)
  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
  , mAdditionalWordChars(aAdditionalWordChars)
  , mCursor(nullptr)
  , mEnd(nullptr)
  , mNextCustomTokenID(TOKEN_CUSTOM0)
{
}

TokenizerBase::Token
TokenizerBase::AddCustomToken(const nsACString & aValue,
                              ECaseSensitivity aCaseInsensitivity, bool aEnabled)
{
  MOZ_ASSERT(!aValue.IsEmpty());

  UniquePtr<Token>& t = *mCustomTokens.AppendElement();
  t = MakeUnique<Token>();

  t->mType = static_cast<TokenType>(++mNextCustomTokenID);
  t->mCustomCaseInsensitivity = aCaseInsensitivity;
  t->mCustomEnabled = aEnabled;
  t->mCustom.Assign(aValue);
  return *t;
}

void
TokenizerBase::RemoveCustomToken(Token& aToken)
{
  if (aToken.mType == TOKEN_UNKNOWN) {
    // Already removed
    return;
  }

  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (custom->mType == aToken.mType) {
      mCustomTokens.RemoveElement(custom);
      aToken.mType = TOKEN_UNKNOWN;
      return;
    }
  }

  MOZ_ASSERT(false, "Token to remove not found");
}

void
TokenizerBase::EnableCustomToken(Token const& aToken, bool aEnabled)
{
  if (aToken.mType == TOKEN_UNKNOWN) {
    // Already removed
    return;
  }

  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (custom->Type() == aToken.Type()) {
      // This effectively destroys the token instance.
      custom->mCustomEnabled = aEnabled;
      return;
    }
  }

  MOZ_ASSERT(false, "Token to change not found");
}

void
TokenizerBase::SetTokenizingMode(Mode aMode)
{
  mMode = aMode;
}

bool
Tokenizer::HasInput() const
TokenizerBase::HasFailed() const
{
  return mHasFailed;
}

bool
TokenizerBase::HasInput() const
{
  return !mPastEof;
}

nsACString::const_char_iterator
Tokenizer::Parse(Token& aToken) const
TokenizerBase::Parse(Token& aToken) const
{
  if (mCursor == mEnd) {
    if (!mInputFinished) {
      return mCursor;
    }

    aToken = Token::EndOfFile();
    return mEnd;
  }

  nsACString::size_type available = mEnd - mCursor;

  uint32_t longestCustom = 0;
  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (IsCustom(mCursor, *custom, &longestCustom)) {
      aToken = *custom;
      return mCursor + custom->mCustom.Length();
    }
  }

  if (!mInputFinished && available < longestCustom) {
    // Not enough data to deterministically decide.
    return mCursor;
  }

  nsACString::const_char_iterator next = mCursor;

  if (mMode == Mode::CUSTOM_ONLY) {
    // We have to do a brute-force search for all of the enabled custom
    // tokens.
    while (next < mEnd) {
      ++next;
      for (UniquePtr<Token> const& custom : mCustomTokens) {
        if (IsCustom(next, *custom)) {
          aToken = Token::Raw();
          return next;
        }
      }
    }

    if (mInputFinished) {
      // End of the data reached.
      aToken = Token::Raw();
      return next;
    }

    if (longestCustom < available && available > mMinRawDelivery) {
      // We can return some data w/o waiting for either a custom token or a
      // call to FinishInput() when we leave the tail where all the custom
      // tokens potentially fit, so we can't lose a token that has only
      // partially arrived. This preserves reasonable granularity.
      aToken = Token::Raw();
      return mEnd - longestCustom + 1;
    }

    // Not enough data to deterministically decide.
    return mCursor;
  }

enum State {
  PARSE_INTEGER,
  PARSE_WORD,

@@ -326,6 +451,9 @@ Tokenizer::Parse(Token& aToken) const
      resultingNumber += static_cast<uint64_t>(*next - '0');

      ++next;
      if (IsPending(next)) {
        break;
      }
      if (IsEnd(next) || !IsNumber(*next)) {
        if (!resultingNumber.isValid()) {
          aToken = Token::Error();

@@ -338,6 +466,9 @@ Tokenizer::Parse(Token& aToken) const

    case PARSE_WORD:
      ++next;
      if (IsPending(next)) {
        break;
      }
      if (IsEnd(next) || !IsWord(*next)) {
        aToken = Token::Word(Substring(mCursor, next));
        return next;

@@ -346,6 +477,9 @@ Tokenizer::Parse(Token& aToken) const

    case PARSE_CRLF:
      ++next;
      if (IsPending(next)) {
        break;
      }
      if (!IsEnd(next) && *next == '\n') { // LF is optional
        ++next;
      }

@@ -369,17 +503,24 @@ Tokenizer::Parse(Token& aToken) const
  } // switch (state)
  } // while (next < end)

  return next;
  MOZ_ASSERT(!mInputFinished);
  return mCursor;
}

bool
Tokenizer::IsEnd(const nsACString::const_char_iterator& caret) const
TokenizerBase::IsEnd(const nsACString::const_char_iterator& caret) const
{
  return caret == mEnd;
}

bool
Tokenizer::IsWordFirst(const char aInput) const
TokenizerBase::IsPending(const nsACString::const_char_iterator& caret) const
{
  return IsEnd(caret) && !mInputFinished;
}

bool
TokenizerBase::IsWordFirst(const char aInput) const
{
  // TODO: make this fully work with unicode
  return (ToLowerCase(static_cast<uint32_t>(aInput)) !=

@@ -389,50 +530,107 @@ Tokenizer::IsWordFirst(const char aInput) const
}

bool
Tokenizer::IsWord(const char aInput) const
TokenizerBase::IsWord(const char aInput) const
{
  return IsWordFirst(aInput) || IsNumber(aInput);
}

bool
Tokenizer::IsNumber(const char aInput) const
TokenizerBase::IsNumber(const char aInput) const
{
  // TODO: are there unicode numbers?
  return aInput >= '0' && aInput <= '9';
}

// Tokenizer::Token
bool
TokenizerBase::IsCustom(const nsACString::const_char_iterator & caret,
                        const Token & aCustomToken,
                        uint32_t * aLongest) const
{
  MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
  if (!aCustomToken.mCustomEnabled) {
    return false;
  }

Tokenizer::Token::Token(const Token& aOther)
  if (aLongest) {
    *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length());
  }

  uint32_t inputLength = mEnd - caret;
  if (aCustomToken.mCustom.Length() > inputLength) {
    return false;
  }

  nsDependentCSubstring inputFragment(caret, aCustomToken.mCustom.Length());
  if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
    return inputFragment.Equals(aCustomToken.mCustom, nsCaseInsensitiveUTF8StringComparator());
  }
  return inputFragment.Equals(aCustomToken.mCustom);
}

void TokenizerBase::AssignFragment(Token& aToken,
                                   nsACString::const_char_iterator begin,
                                   nsACString::const_char_iterator end)
{
  aToken.AssignFragment(begin, end);
}

// TokenizerBase::Token

TokenizerBase::Token::Token()
  : mType(TOKEN_UNKNOWN)
  , mChar(0)
  , mInteger(0)
  , mCustomCaseInsensitivity(CASE_SENSITIVE)
  , mCustomEnabled(false)
{
}

TokenizerBase::Token::Token(const Token& aOther)
  : mType(aOther.mType)
  , mCustom(aOther.mCustom)
  , mChar(aOther.mChar)
  , mInteger(aOther.mInteger)
  , mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity)
  , mCustomEnabled(aOther.mCustomEnabled)
{
  if (mType == TOKEN_WORD) {
  if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
    mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  }
}

Tokenizer::Token&
Tokenizer::Token::operator=(const Token& aOther)
TokenizerBase::Token&
TokenizerBase::Token::operator=(const Token& aOther)
{
  mType = aOther.mType;
  mCustom = aOther.mCustom;
  mChar = aOther.mChar;
  mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  mInteger = aOther.mInteger;
  mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;
  mCustomEnabled = aOther.mCustomEnabled;
  return *this;
}

void
Tokenizer::Token::AssignFragment(nsACString::const_char_iterator begin,
TokenizerBase::Token::AssignFragment(nsACString::const_char_iterator begin,
                                     nsACString::const_char_iterator end)
{
  mFragment.Rebind(begin, end - begin);
}

// static
Tokenizer::Token
Tokenizer::Token::Word(const nsACString& aValue)
TokenizerBase::Token
TokenizerBase::Token::Raw()
{
  Token t;
  t.mType = TOKEN_RAW;
  return t;
}

// static
TokenizerBase::Token
TokenizerBase::Token::Word(const nsACString& aValue)
{
  Token t;
  t.mType = TOKEN_WORD;

@@ -441,8 +639,8 @@ Tokenizer::Token::Word(const nsACString& aValue)
}

// static
Tokenizer::Token
Tokenizer::Token::Char(const char aValue)
TokenizerBase::Token
TokenizerBase::Token::Char(const char aValue)
{
  Token t;
  t.mType = TOKEN_CHAR;

@@ -451,8 +649,8 @@ Tokenizer::Token::Char(const char aValue)
}

// static
Tokenizer::Token
Tokenizer::Token::Number(const uint64_t aValue)
TokenizerBase::Token
TokenizerBase::Token::Number(const uint64_t aValue)
{
  Token t;
  t.mType = TOKEN_INTEGER;

@@ -461,8 +659,8 @@ Tokenizer::Token::Number(const uint64_t aValue)
}

// static
Tokenizer::Token
Tokenizer::Token::Whitespace()
TokenizerBase::Token
TokenizerBase::Token::Whitespace()
{
  Token t;
  t.mType = TOKEN_WS;

@@ -471,8 +669,8 @@ Tokenizer::Token::Whitespace()
}

// static
Tokenizer::Token
Tokenizer::Token::NewLine()
TokenizerBase::Token
TokenizerBase::Token::NewLine()
{
  Token t;
  t.mType = TOKEN_EOL;

@@ -480,8 +678,8 @@ Tokenizer::Token::NewLine()
}

// static
Tokenizer::Token
Tokenizer::Token::EndOfFile()
TokenizerBase::Token
TokenizerBase::Token::EndOfFile()
{
  Token t;
  t.mType = TOKEN_EOF;

@@ -489,8 +687,8 @@ Tokenizer::Token::EndOfFile()
}

// static
Tokenizer::Token
Tokenizer::Token::Error()
TokenizerBase::Token
TokenizerBase::Token::Error()
{
  Token t;
  t.mType = TOKEN_ERROR;

@@ -498,7 +696,7 @@ Tokenizer::Token::Error()
}

bool
Tokenizer::Token::Equals(const Token& aOther) const
TokenizerBase::Token::Equals(const Token& aOther) const
{
  if (mType != aOther.mType) {
    return false;

@@ -517,21 +715,21 @@ Tokenizer::Token::Equals(const Token& aOther) const
}

char
Tokenizer::Token::AsChar() const
TokenizerBase::Token::AsChar() const
{
  MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
  return mChar;
}

nsDependentCSubstring
Tokenizer::Token::AsString() const
TokenizerBase::Token::AsString() const
{
  MOZ_ASSERT(mType == TOKEN_WORD);
  return mWord;
}

uint64_t
Tokenizer::Token::AsInteger() const
TokenizerBase::Token::AsInteger() const
{
  MOZ_ASSERT(mType == TOKEN_INTEGER);
  return mInteger;
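The raw-delivery cut-off in the CUSTOM_ONLY branch of Parse() above deserves a worked example. A minimal standalone sketch of the same arithmetic (the helper name and buffer contents are hypothetical, mirroring the IncrementalBuffering1 test further down):

#include <cassert>
#include <cstdint>
#include <cstring>

// Mirrors `return mEnd - longestCustom + 1;` above. With unfinished input,
// everything before the last (longestCustom - 1) bytes is safe to flush as
// TOKEN_RAW: no custom token can start there and still reach beyond the
// current end of the buffer.
const char* SafeRawEnd(const char* end, uint32_t longestCustom)
{
  return end - longestCustom + 1;
}

int main()
{
  const char buf[] = "6789aa";  // "aaa" (length 3) is the longest token
  const char* end = buf + std::strlen(buf);
  const char* cut = SafeRawEnd(end, 3);
  // Bytes [buf, cut) == "6789" may be delivered raw; the trailing "aa"
  // must stay buffered because it may be the start of "aaa".
  assert(cut - buf == 4);
}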
Tokenizer.h
@@ -9,32 +9,36 @@

#include "nsString.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/UniquePtr.h"
#include "nsTArray.h"

namespace mozilla {

/**
 * This is a simple implementation of a lexical analyzer or maybe better
 * called a tokenizer. It doesn't allow any user dictionaries or
 * user-defined token types.
 *
 * It is limited to ASCII input for now. UTF-8 or any other input
 * encoding must yet be implemented.
 */
class Tokenizer {
class TokenizerBase
{
public:
  /**
   * The analyzer works with elements in the input cut to a sequence of tokens
   * where each token has an elementary type
   */
  enum TokenType {
  enum TokenType : uint32_t
  {
    TOKEN_UNKNOWN,
    TOKEN_RAW,
    TOKEN_ERROR,
    TOKEN_INTEGER,
    TOKEN_WORD,
    TOKEN_CHAR,
    TOKEN_WS,
    TOKEN_EOL,
    TOKEN_EOF
    TOKEN_EOF,
    TOKEN_CUSTOM0 = 1000
  };

  enum ECaseSensitivity
  {
    CASE_SENSITIVE,
    CASE_INSENSITIVE
  };

  /**

@@ -42,23 +46,29 @@ public:
   * to allow checks against it via methods of Tokenizer or are results of some of
   * the Tokenizer's methods.
   */
  class Token {
  class Token
  {
    TokenType mType;
    nsDependentCSubstring mWord;
    nsCString mCustom;
    char mChar;
    uint64_t mInteger;
    ECaseSensitivity mCustomCaseInsensitivity;
    bool mCustomEnabled;

    // If this token is a result of the parsing process, this member references
    // a sub-string in the input buffer. If this is an externally created Token,
    // this member is left an empty string.
    nsDependentCSubstring mFragment;

    friend class Tokenizer;
    friend class TokenizerBase;
    void AssignFragment(nsACString::const_char_iterator begin,
                        nsACString::const_char_iterator end);

    static Token Raw();

  public:
    Token() : mType(TOKEN_UNKNOWN), mChar(0), mInteger(0) {}
    Token();
    Token(const Token& aOther);
    Token& operator=(const Token& aOther);


@@ -83,6 +93,120 @@ public:
    nsDependentCSubstring Fragment() const { return mFragment; }
  };

  /**
   * Consumers may register a custom string that, when found in the input, is
   * considered a token and returned by the Next*() methods and accepted by the
   * Check*() methods. AddCustomToken() returns a reference to a token that can
   * then be compared using Token::Equals() against the output from Next*() or
   * be passed to Check*().
   */
  Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
  template <uint32_t N>
  Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
  {
    return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
  }
  void RemoveCustomToken(Token& aToken);
  /**
   * Only applies to a custom type of a Token (see AddCustomToken above.)
   * This turns token recognition on and off. When a custom token is disabled,
   * it's ignored as if it had never been added as a custom token.
   */
  void EnableCustomToken(Token const& aToken, bool aEnable);

  /**
   * Mode of tokenization.
   * FULL tokenization, the default, recognizes built-in tokens and any custom
   * tokens, if added.
   * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'.
   * This mode can be understood as a 'binary' mode.
   */
  enum class Mode
  {
    FULL,
    CUSTOM_ONLY
  };
  void SetTokenizingMode(Mode aMode);

  /**
   * Returns true iff the last Check*() call has returned false or when we've
   * read past the end of the input string.
   */
  MOZ_MUST_USE bool HasFailed() const;

protected:
  explicit TokenizerBase(const char* aWhitespaces = nullptr,
                         const char* aAdditionalWordChars = nullptr);

  // false if we have already read the EOF token.
  bool HasInput() const;
  // Main parsing function, it doesn't shift the read cursor, just returns the next
  // token position.
  nsACString::const_char_iterator Parse(Token& aToken) const;
  // Is the read cursor at the end?
  bool IsEnd(const nsACString::const_char_iterator& caret) const;
  // True when we are at the end of the input data, but it has not been marked
  // as complete yet. In that case we cannot proceed with providing a multi-char token.
  bool IsPending(const nsACString::const_char_iterator & caret) const;
  // Is the read cursor on a character that is a word start?
  bool IsWordFirst(const char aInput) const;
  // Is the read cursor on a character that is an in-word letter?
  bool IsWord(const char aInput) const;
  // Is the read cursor on a character that is a valid number?
  // TODO - support multiple radixes
  bool IsNumber(const char aInput) const;
  // Is the input at the caret equal to the given custom token?
  bool IsCustom(const nsACString::const_char_iterator& caret,
                const Token& aCustomToken, uint32_t* aLongest = nullptr) const;

  // Friendly helper to assign a fragment on a Token
  static void AssignFragment(Token& aToken,
                             nsACString::const_char_iterator begin,
                             nsACString::const_char_iterator end);

  // true iff we have already read the EOF token
  bool mPastEof;
  // true iff the last Check*() call has returned false, reverts to true on Rollback() call
  bool mHasFailed;
  // true if the input string is final (finished), false when we expect more data
  // yet to be fed to the tokenizer (see the IncrementalTokenizer derived class).
  bool mInputFinished;
  // custom-only vs full tokenizing mode, see the Parse() method
  Mode mMode;
  // minimal raw data chunk delivery during an incremental feed
  uint32_t mMinRawDelivery;

  // Customizable list of whitespaces
  const char* mWhitespaces;
  // Additional custom word characters
  const char* mAdditionalWordChars;

  // All these point to the original buffer passed to the constructor or to the incremental
  // buffer after FeedInput.
  nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
  nsACString::const_char_iterator mEnd; // End of the input position

  // This is the list of tokens the user has registered with AddCustomToken()
  nsTArray<UniquePtr<Token>> mCustomTokens;
  uint32_t mNextCustomTokenID;

private:
  TokenizerBase() = delete;
  TokenizerBase(const TokenizerBase&) = delete;
  TokenizerBase(TokenizerBase&&) = delete;
  TokenizerBase(const TokenizerBase&&) = delete;
  TokenizerBase &operator=(const TokenizerBase&) = delete;
};

/**
 * This is a simple implementation of a lexical analyzer or maybe better
 * called a tokenizer. It doesn't allow any user dictionaries or
 * user-defined token types.
 *
 * It is limited to ASCII input for now. UTF-8 or any other input
 * encoding must yet be implemented.
 */
class Tokenizer : public TokenizerBase
{
public:
  /**
   * @param aSource

@@ -133,13 +257,6 @@ public:
  MOZ_MUST_USE
  bool Check(const Token& aToken);

  /**
   * Returns true iff the last Check*() call has returned false or when we've
   * read past the end of the input string.
   */
  MOZ_MUST_USE
  bool HasFailed() const;

  /**
   * SkipWhites method (below) may also skip new line characters automatically.
   */

@@ -312,36 +429,9 @@ public:
                     ClaimInclusion aInclude = EXCLUDE_LAST);

protected:
  // false if we have already read the EOF token.
  bool HasInput() const;
  // Main parsing function, it doesn't shift the read cursor, just returns the next
  // token position.
  nsACString::const_char_iterator Parse(Token& aToken) const;
  // Is the read cursor at the end?
  bool IsEnd(const nsACString::const_char_iterator& caret) const;
  // Is the read cursor on a character that is a word start?
  bool IsWordFirst(const char aInput) const;
  // Is the read cursor on a character that is an in-word letter?
  bool IsWord(const char aInput) const;
  // Is the read cursor on a character that is a valid number?
  // TODO - support multiple radixes
  bool IsNumber(const char aInput) const;

  // true iff we have already read the EOF token
  bool mPastEof;
  // true iff the last Check*() call has returned false, reverts to true on Rollback() call
  bool mHasFailed;

  // Customizable list of whitespaces
  const char* mWhitespaces;
  // Additional custom word characters
  const char* mAdditionalWordChars;

  // All these point to the original buffer passed to the Tokenizer
  // All these point to the original buffer passed to the Tokenizer's constructor
  nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
  nsACString::const_char_iterator mRollback; // Position of the previous token start
  nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
  nsACString::const_char_iterator mEnd; // End of the input position

private:
  Tokenizer() = delete;
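The custom-token API declared above composes with the tokenizing mode. A minimal sketch of scanning opaque data for a registered separator (the input and token strings are made up for illustration):

#include "mozilla/Tokenizer.h"

using namespace mozilla;

void SplitOnBoundary(const nsACString& aBlob)
{
  Tokenizer p(aBlob);
  Tokenizer::Token boundary =
    p.AddCustomToken("--boundary", Tokenizer::CASE_SENSITIVE);

  // Treat everything except the registered token as opaque raw data.
  p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  Tokenizer::Token t;
  while (p.Next(t)) {
    if (t.Equals(boundary)) {
      // ... a boundary was found ...
    } else if (t.Type() == Tokenizer::TOKEN_RAW) {
      // ... t.Fragment() is the data between boundaries ...
    }
  }
}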
moz.build
@@ -83,12 +83,14 @@ EXPORTS += [
]

EXPORTS.mozilla += [
    'IncrementalTokenizer.h',
    'Observer.h',
    'StickyTimeDuration.h',
    'Tokenizer.h',
]

UNIFIED_SOURCES += [
    'IncrementalTokenizer.cpp',
    'nsArray.cpp',
    'nsArrayEnumerator.cpp',
    'nsArrayUtils.cpp',
TestTokenizer.cpp
@@ -5,6 +5,8 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/Tokenizer.h"
#include "mozilla/IncrementalTokenizer.h"
#include "mozilla/Unused.h"
#include "gtest/gtest.h"

using namespace mozilla;

@@ -732,3 +734,401 @@ TEST(Tokenizer, SkipUntil)
    EXPECT_TRUE(p.CheckEOF());
  }
}

TEST(Tokenizer, Custom)
{
  Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2");

  Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE);
  Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE);

  // It's expected to NOT FIND the custom token if it's not on an edge
  // between other recognizable tokens.
  EXPECT_TRUE(p.CheckWord("aaaaaacustom"));
  EXPECT_TRUE(p.CheckChar('-'));
  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1)));
  EXPECT_TRUE(p.CheckEOL());
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.Check(c1));
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.Check(c1));
  EXPECT_TRUE(p.CheckChar(','));

  p.EnableCustomToken(c1, false);
  EXPECT_TRUE(p.CheckWord("Custom"));
  EXPECT_TRUE(p.CheckChar('-'));
  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1)));
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(0)));
  EXPECT_TRUE(p.Check(c2));
  EXPECT_TRUE(p.CheckWord("xxxx"));
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.CheckWord("CUSTOM"));
  EXPECT_TRUE(p.CheckChar('-'));
  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(2)));

  EXPECT_TRUE(p.CheckEOF());
}

TEST(Tokenizer, CustomRaw)
{
  Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2");

  Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE);
  Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE);

  // In this mode it's expected to find all custom tokens among any kind of input.
  p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  Tokenizer::Token t;

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral("aaaaaa"));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral("\r,"));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral(","));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral(","));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral(",00"));

  EXPECT_TRUE(p.Check(c2));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral("xxxx,CUSTOM-2"));

  EXPECT_TRUE(p.CheckEOF());
}

TEST(Tokenizer, Incremental)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break;
    case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break;
    case 4: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 7: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break;
    case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3");
  auto cur = input.BeginReading();
  auto end = input.EndReading();
  for (; cur < end; ++cur) {
    i.FeedInput(nsDependentCSubstring(cur, 1));
  }

  EXPECT_TRUE(test == 6);
  i.FinishInput();
  EXPECT_TRUE(test == 8);
}

TEST(Tokenizer, IncrementalRollback)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break;
    case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2"))));
      i.Rollback(); // so that we get the token again
      break;
    case 4: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break;
    case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 7: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 8: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break;
    case 9: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3");
  auto cur = input.BeginReading();
  auto end = input.EndReading();
  for (; cur < end; ++cur) {
    i.FeedInput(nsDependentCSubstring(cur, 1));
  }

  EXPECT_TRUE(test == 7);
  i.FinishInput();
  EXPECT_TRUE(test == 9);
}

TEST(Tokenizer, IncrementalNeedMoreInput)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    Token t2;
    switch (++test) {
    case 1:
      EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("a"))));
      break;
    case 2:
    case 3:
    case 4:
    case 5:
      EXPECT_TRUE(t.Equals(Token::Whitespace()));
      if (i.Next(t2)) {
        EXPECT_TRUE(test == 5);
        EXPECT_TRUE(t2.Equals(Token::Word(NS_LITERAL_CSTRING("bb"))));
      } else {
        EXPECT_TRUE(test < 5);
        i.NeedMoreInput();
      }
      break;
    case 6:
      EXPECT_TRUE(t.Equals(Token::Char(',')));
      break;
    case 7:
      EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("c"))));
      return NS_ERROR_FAILURE;
    default:
      EXPECT_TRUE(false);
      break;
    }

    return NS_OK;
  });

  NS_NAMED_LITERAL_CSTRING(input, "a bb,c");
  auto cur = input.BeginReading();
  auto end = input.EndReading();

  nsresult rv;
  for (; cur < end; ++cur) {
    rv = i.FeedInput(nsDependentCSubstring(cur, 1));
    if (NS_FAILED(rv)) {
      break;
    }
  }

  EXPECT_TRUE(rv == NS_OK);
  EXPECT_TRUE(test == 6);

  rv = i.FinishInput();
  EXPECT_TRUE(rv == NS_ERROR_FAILURE);
  EXPECT_TRUE(test == 7);
}

TEST(Tokenizer, IncrementalCustom)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(custom)); break;
    case 2: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("bla")))); break;
    case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  }, nullptr, "-");

  custom = i.AddCustomToken("some-test", Tokenizer::CASE_SENSITIVE);
  i.FeedInput(NS_LITERAL_CSTRING("some-"));
  EXPECT_TRUE(test == 0);
  i.FeedInput(NS_LITERAL_CSTRING("tes"));
  EXPECT_TRUE(test == 0);
  i.FeedInput(NS_LITERAL_CSTRING("tbla"));
  EXPECT_TRUE(test == 1);
  i.FinishInput();
  EXPECT_TRUE(test == 3);
}

TEST(Tokenizer, IncrementalCustomRaw)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("test1,")); break;
    case 2: EXPECT_TRUE(t.Equals(custom)); break;
    case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("!,,test3"));
      i.Rollback();
      i.SetTokenizingMode(Tokenizer::Mode::FULL);
      break;
    case 4: EXPECT_TRUE(t.Equals(Token::Char('!')));
      i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
      break;
    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral(",,test3")); break;
    case 6: EXPECT_TRUE(t.Equals(custom)); break;
    case 7: EXPECT_TRUE(t.Fragment().EqualsLiteral("tes")); break;
    case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  custom = i.AddCustomToken("test2", Tokenizer::CASE_SENSITIVE);
  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  NS_NAMED_LITERAL_CSTRING(input, "test1,test2!,,test3test2tes");
  auto cur = input.BeginReading();
  auto end = input.EndReading();
  for (; cur < end; ++cur) {
    i.FeedInput(nsDependentCSubstring(cur, 1));
  }

  EXPECT_TRUE(test == 6);
  i.FinishInput();
  EXPECT_TRUE(test == 8);
}

TEST(Tokenizer, IncrementalCustomRemove)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(custom));
      i.RemoveCustomToken(custom);
      break;
    case 2: EXPECT_FALSE(t.Equals(custom)); break;
    case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  custom = i.AddCustomToken("custom1", Tokenizer::CASE_SENSITIVE);

  NS_NAMED_LITERAL_CSTRING(input, "custom1custom1");
  i.FeedInput(input);
  EXPECT_TRUE(test == 1);
  i.FinishInput();
  EXPECT_TRUE(test == 3);
}

TEST(Tokenizer, IncrementalBuffering1)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  nsDependentCSubstring observedFragment;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("012")); break;
    case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("3456789")); break;
    case 3: EXPECT_TRUE(t.Equals(custom)); break;
    case 4: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwe")); break;
    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("rt")); break;
    case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    observedFragment.Rebind(t.Fragment().BeginReading(),
                            t.Fragment().Length());
    return NS_OK;
  }, nullptr, nullptr, 3);

  custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
  // This externally unused token is added only to check the internal algorithm
  // does work correctly as expected when there are two different-length tokens.
  Unused << i.AddCustomToken("bb", Tokenizer::CASE_SENSITIVE);
  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  i.FeedInput(NS_LITERAL_CSTRING("01234"));
  EXPECT_TRUE(test == 1);
  EXPECT_TRUE(observedFragment.EqualsLiteral("012"));

  i.FeedInput(NS_LITERAL_CSTRING("5"));
  EXPECT_TRUE(test == 1);
  i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
  EXPECT_TRUE(test == 2);
  EXPECT_TRUE(observedFragment.EqualsLiteral("3456789"));

  i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
  EXPECT_TRUE(test == 4);
  EXPECT_TRUE(observedFragment.EqualsLiteral("qwe"));

  i.FinishInput();
  EXPECT_TRUE(test == 6);
}

TEST(Tokenizer, IncrementalBuffering2)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("01")); break;
    case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("234567")); break;
    case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("89")); break;
    case 4: EXPECT_TRUE(t.Equals(custom)); break;
    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwert")); break;
    case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }
    return NS_OK;
  }, nullptr, nullptr, 3);

  custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
  // This externally unused token is added only to check the internal algorithm
  // does work correctly as expected when there are two different-length tokens.
  Unused << i.AddCustomToken("bbbbb", Tokenizer::CASE_SENSITIVE);
  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  i.FeedInput(NS_LITERAL_CSTRING("01234"));
  EXPECT_TRUE(test == 0);
  i.FeedInput(NS_LITERAL_CSTRING("5"));
  EXPECT_TRUE(test == 1);
  i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
  EXPECT_TRUE(test == 2);
  i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
  EXPECT_TRUE(test == 4);
  i.FinishInput();
  EXPECT_TRUE(test == 6);
}