From 3a39b6dbeea1c3dd65dce82774d2bd5a621b45b8 Mon Sep 17 00:00:00 2001
From: Honza Bambas
Date: Mon, 6 Feb 2017 10:49:00 -0500
Subject: [PATCH] Bug 1322825 - Incremental tokenizer. r=froydnj

---
 xpcom/ds/IncrementalTokenizer.cpp   | 195 ++++++++++++++
 xpcom/ds/IncrementalTokenizer.h     | 122 +++++++++
 xpcom/ds/Tokenizer.cpp              | 288 ++++++++++++++++----
 xpcom/ds/Tokenizer.h                | 188 +++++++++----
 xpcom/ds/moz.build                  |   2 +
 xpcom/tests/gtest/TestTokenizer.cpp | 400 ++++++++++++++++++++++++++++
 6 files changed, 1101 insertions(+), 94 deletions(-)
 create mode 100644 xpcom/ds/IncrementalTokenizer.cpp
 create mode 100644 xpcom/ds/IncrementalTokenizer.h

diff --git a/xpcom/ds/IncrementalTokenizer.cpp b/xpcom/ds/IncrementalTokenizer.cpp
new file mode 100644
index 000000000000..429428516136
--- /dev/null
+++ b/xpcom/ds/IncrementalTokenizer.cpp
@@ -0,0 +1,195 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/IncrementalTokenizer.h"
+
+#include "mozilla/AutoRestore.h"
+
+#include "nsIInputStream.h"
+#include "IncrementalTokenizer.h"
+#include <algorithm>
+
+namespace mozilla {
+
+IncrementalTokenizer::IncrementalTokenizer(Consumer aConsumer,
+                                           const char * aWhitespaces,
+                                           const char * aAdditionalWordChars,
+                                           uint32_t aRawMinBuffered)
+  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
+#ifdef DEBUG
+  , mConsuming(false)
+#endif
+  , mNeedMoreInput(false)
+  , mRollback(false)
+  , mInputCursor(0)
+  , mConsumer(aConsumer)
+{
+  mInputFinished = false;
+  mMinRawDelivery = aRawMinBuffered;
+}
+
+nsresult IncrementalTokenizer::FeedInput(const nsACString & aInput)
+{
+  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
+  MOZ_ASSERT(!mInputFinished);
+
+  mInput.Cut(0, mInputCursor);
+  mInputCursor = 0;
+
+  mInput.Append(aInput);
+
+  return Process();
+}
+
+nsresult IncrementalTokenizer::FeedInput(nsIInputStream * aInput, uint32_t aCount)
+{
+  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
+  MOZ_ASSERT(!mInputFinished);
+  MOZ_ASSERT(!mConsuming);
+
+  mInput.Cut(0, mInputCursor);
+  mInputCursor = 0;
+
+  nsresult rv = NS_OK;
+  while (NS_SUCCEEDED(rv) && aCount) {
+    nsCString::index_type remainder = mInput.Length();
+    nsCString::index_type load =
+      std::min<nsCString::index_type>(aCount, PR_UINT32_MAX - remainder);
+
+    if (!load) {
+      // To keep the API simple, we fail if the input data buffer is filled.
+      // It's highly unlikely there will ever be such an amount of data
+      // accumulated unless there is a logic fault in the consumer code.
+      NS_ERROR("IncrementalTokenizer consumer not reading data?");
+      return NS_ERROR_OUT_OF_MEMORY;
+    }
+
+    if (!mInput.SetLength(remainder + load, fallible)) {
+      return NS_ERROR_OUT_OF_MEMORY;
+    }
+
+    nsCString::char_iterator buffer = mInput.BeginWriting() + remainder;
+
+    uint32_t read;
+    rv = aInput->Read(buffer, load, &read);
+    if (NS_SUCCEEDED(rv)) {
+      // remainder + load fits within uint32_t, so remainder + read must as well.
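+      // Commit only the bytes actually read, then tokenize what is buffered;
+      // any incomplete tail token stays in mInput for the next round.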
+      mInput.SetLength(remainder + read);
+      aCount -= read;
+
+      rv = Process();
+    }
+  }
+
+  return rv;
+}
+
+nsresult IncrementalTokenizer::FinishInput()
+{
+  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
+  MOZ_ASSERT(!mInputFinished);
+  MOZ_ASSERT(!mConsuming);
+
+  mInput.Cut(0, mInputCursor);
+  mInputCursor = 0;
+
+  mInputFinished = true;
+  nsresult rv = Process();
+  mConsumer = nullptr;
+  return rv;
+}
+
+bool IncrementalTokenizer::Next(Token & aToken)
+{
+  // Assert we are called only from the consumer callback
+  MOZ_ASSERT(mConsuming);
+
+  if (mPastEof) {
+    return false;
+  }
+
+  nsACString::const_char_iterator next = Parse(aToken);
+  mPastEof = aToken.Type() == TOKEN_EOF;
+  if (next == mCursor && !mPastEof) {
+    // Not enough input to make a deterministic decision.
+    return false;
+  }
+
+  AssignFragment(aToken, mCursor, next);
+  mCursor = next;
+  return true;
+}
+
+void IncrementalTokenizer::NeedMoreInput()
+{
+  // Assert we are called only from the consumer callback
+  MOZ_ASSERT(mConsuming);
+
+  // Once the input has been finished, we must not set the flag, to prevent
+  // an indefinite wait for more input (which will never come).
+  mNeedMoreInput = !mInputFinished;
+}
+
+void IncrementalTokenizer::Rollback()
+{
+  // Assert we are called only from the consumer callback
+  MOZ_ASSERT(mConsuming);
+
+  mRollback = true;
+}
+
+nsresult IncrementalTokenizer::Process()
+{
+#ifdef DEBUG
+  // Assert we are not re-entered
+  MOZ_ASSERT(!mConsuming);
+
+  AutoRestore<bool> consuming(mConsuming);
+  mConsuming = true;
+#endif
+
+  MOZ_ASSERT(!mPastEof);
+
+  nsresult rv = NS_OK;
+
+  mInput.BeginReading(mCursor);
+  mCursor += mInputCursor;
+  mInput.EndReading(mEnd);
+
+  while (NS_SUCCEEDED(rv) && !mPastEof) {
+    Token token;
+    nsACString::const_char_iterator next = Parse(token);
+    mPastEof = token.Type() == TOKEN_EOF;
+    if (next == mCursor && !mPastEof) {
+      // Not enough input to make a deterministic decision.
+      break;
+    }
+
+    AssignFragment(token, mCursor, next);
+
+    nsACString::const_char_iterator rollback = mCursor;
+    mCursor = next;
+
+    mNeedMoreInput = mRollback = false;
+
+    rv = mConsumer(token, *this);
+    if (NS_FAILED(rv)) {
+      break;
+    }
+    if (mNeedMoreInput || mRollback) {
+      mCursor = rollback;
+      mPastEof = false;
+      if (mNeedMoreInput) {
+        break;
+      }
+    }
+  }
+
+  mInputCursor = mCursor - mInput.BeginReading();
+  return rv;
+}
+
+} // mozilla
diff --git a/xpcom/ds/IncrementalTokenizer.h b/xpcom/ds/IncrementalTokenizer.h
new file mode 100644
index 000000000000..f93668e638a5
--- /dev/null
+++ b/xpcom/ds/IncrementalTokenizer.h
@@ -0,0 +1,122 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef INCREMENTAL_TOKENIZER_H__
+#define INCREMENTAL_TOKENIZER_H__
+
+#include "mozilla/Tokenizer.h"
+
+#include "nsError.h"
+#include <functional>
+
+class nsIInputStream;
+
+namespace mozilla {
+
+class IncrementalTokenizer : public TokenizerBase
+{
+public:
+  /**
+   * The consumer callback. The function is called for every single token
+   * found in the input. A failure result returned by this callback stops
+   * the tokenization immediately and bubbles up to the result of
+   * Feed/FinishInput.
+   *
+   * Fragment()s of consumed tokens are guaranteed to remain valid until the
+   * next call to Feed/FinishInput and point into a single linear buffer.
+   * Hence, they can safely be used to accumulate data for processing after
+   * Feed/FinishInput has returned.
+   */
+  typedef std::function<nsresult(Token const&, IncrementalTokenizer&)> Consumer;
+
+  /**
+   * For the aWhitespaces and aAdditionalWordChars arguments see TokenizerBase.
+   *
+   * @param aConsumer
+   *    A mandatory non-null argument, a function that consumes the tokens as
+   *    they come when the tokenizer is fed.
+   * @param aRawMinBuffered
+   *    When we have buffered at least aRawMinBuffered data, but no custom
+   *    token has been found so far because the incremental feed chunks were
+   *    too small, deliver the raw data to preserve streaming and to save
+   *    memory. This only has an effect in the CUSTOM_ONLY tokenizing mode.
+   */
+  explicit IncrementalTokenizer(Consumer aConsumer,
+                                const char* aWhitespaces = nullptr,
+                                const char* aAdditionalWordChars = nullptr,
+                                uint32_t aRawMinBuffered = 1024);
+
+  /**
+   * Pushes the input to be tokenized. These directly call the Consumer callback
+   * on every found token. The result of the Consumer callback is returned here.
+   *
+   * The tokenizer must be initialized with a valid consumer prior to calling
+   * these methods. It's not allowed to call Feed/FinishInput from inside the
+   * Consumer callback.
+   */
+  nsresult FeedInput(const nsACString& aInput);
+  nsresult FeedInput(nsIInputStream* aInput, uint32_t aCount);
+  nsresult FinishInput();
+
+  /**
+   * Can only be called from inside the consumer callback.
+   *
+   * When there is still anything to read from the input, tokenize it, store
+   * the token type and value into the aToken result, and shift the cursor
+   * past the just-parsed token. Each call to Next() reads another token from
+   * the input and shifts the cursor.
+   *
+   * Returns false if there is not enough data to deterministically recognize
+   * a token or when the last returned token was EOF.
+   */
+  MOZ_MUST_USE
+  bool Next(Token& aToken);
+
+  /**
+   * Can only be called from inside the consumer callback.
+   *
+   * Tells the tokenizer to revert the cursor and stop the async parsing until
+   * the next feed of the input. This is useful when more than one token is
+   * needed to decide on the syntax but there is not enough input to get the
+   * next token (Next() returned false.)
+   */
+  void NeedMoreInput();
+
+  /**
+   * Can only be called from inside the consumer callback.
+   *
+   * This makes the consumer callback be called again, parsing the input
+   * from the previous cursor position. This is useful when the tokenizer
+   * state (custom tokens, tokenization mode) has changed and we want to
+   * re-parse the input.
+   */
+  void Rollback();
+
+private:
+  // Loops over the input with TokenizerBase::Parse and calls the Consumer callback.
+  nsresult Process();
+
+#ifdef DEBUG
+  // True when inside the consumer callback, used only for assertions.
+  bool mConsuming;
+#endif // DEBUG
+  // Modifiable only from the Consumer callback; tells the parser to break,
+  // roll back, and wait for more input.
+  bool mNeedMoreInput;
+  // Modifiable only from the Consumer callback; tells the parser to roll back
+  // and parse the input again, with (possibly modified) new settings of the
+  // tokenizer.
+  bool mRollback;
+  // The input buffer. Updated with each call to Feed/FinishInput.
+  nsCString mInput;
+  // Numerical index pointing at the current cursor position. We don't keep a
+  // direct reference into the string buffer, since the buffer often gets
+  // reallocated.
+  nsCString::index_type mInputCursor;
+  // Reference to the consumer function.
+  Consumer mConsumer;
+};
+
+} // mozilla
+
+#endif
diff --git a/xpcom/ds/Tokenizer.cpp b/xpcom/ds/Tokenizer.cpp
index 316232c16c2c..66cc1ebb77b9 100644
--- a/xpcom/ds/Tokenizer.cpp
+++ b/xpcom/ds/Tokenizer.cpp
@@ -7,6 +7,7 @@
 #include "Tokenizer.h"
 
 #include "nsUnicharUtils.h"
+#include <algorithm>
 
 namespace mozilla {
 
@@ -15,11 +16,9 @@ static const char sWhitespaces[] = " \t";
 Tokenizer::Tokenizer(const nsACString& aSource,
                      const char* aWhitespaces,
                      const char* aAdditionalWordChars)
-  : mPastEof(false)
-  , mHasFailed(false)
-  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
-  , mAdditionalWordChars(aAdditionalWordChars)
+  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
 {
+  mInputFinished = true;
   aSource.BeginReading(mCursor);
   mRecord = mRollback = mCursor;
   aSource.EndReading(mEnd);
@@ -43,7 +42,7 @@ Tokenizer::Next(Token& aToken)
   mRollback = mCursor;
   mCursor = Parse(aToken);
 
-  aToken.AssignFragment(mRollback, mCursor);
+  AssignFragment(aToken, mRollback, mCursor);
 
   mPastEof = aToken.Type() == TOKEN_EOF;
   mHasFailed = false;
@@ -67,7 +66,7 @@ Tokenizer::Check(const TokenType aTokenType, Token& aResult)
   mRollback = mCursor;
   mCursor = next;
 
-  aResult.AssignFragment(mRollback, mCursor);
+  AssignFragment(aResult, mRollback, mCursor);
 
   mPastEof = aResult.Type() == TOKEN_EOF;
   mHasFailed = false;
@@ -96,12 +95,6 @@ Tokenizer::Check(const Token& aToken)
   return true;
 }
 
-bool
-Tokenizer::HasFailed() const
-{
-  return mHasFailed;
-}
-
 void
 Tokenizer::SkipWhites(WhiteSkipping aIncludeNewLines)
 {
@@ -275,24 +268,156 @@ Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
   aResult.Rebind(mRecord, close - mRecord);
 }
 
-// protected
+// TokenizerBase
+
+TokenizerBase::TokenizerBase(const char* aWhitespaces,
+                             const char* aAdditionalWordChars)
+  : mPastEof(false)
+  , mHasFailed(false)
+  , mInputFinished(true)
+  , mMode(Mode::FULL)
+  , mMinRawDelivery(1024)
+  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
+  , mAdditionalWordChars(aAdditionalWordChars)
+  , mCursor(nullptr)
+  , mEnd(nullptr)
+  , mNextCustomTokenID(TOKEN_CUSTOM0)
+{
+}
+
+TokenizerBase::Token
+TokenizerBase::AddCustomToken(const nsACString & aValue,
+                              ECaseSensitivity aCaseInsensitivity, bool aEnabled)
+{
+  MOZ_ASSERT(!aValue.IsEmpty());
+
+  UniquePtr<Token>& t = *mCustomTokens.AppendElement();
+  t = MakeUnique<Token>();
+
+  t->mType = static_cast<TokenType>(++mNextCustomTokenID);
+  t->mCustomCaseInsensitivity = aCaseInsensitivity;
+  t->mCustomEnabled = aEnabled;
+  t->mCustom.Assign(aValue);
+  return *t;
+}
+
+void
+TokenizerBase::RemoveCustomToken(Token& aToken)
+{
+  if (aToken.mType == TOKEN_UNKNOWN) {
+    // Already removed
+    return;
+  }
+
+  for (UniquePtr<Token> const& custom : mCustomTokens) {
+    if (custom->mType == aToken.mType) {
+      // This effectively destroys the token instance owned by the array.
+      mCustomTokens.RemoveElement(custom);
+      aToken.mType = TOKEN_UNKNOWN;
+      return;
+    }
+  }
+
+  MOZ_ASSERT(false, "Token to remove not found");
+}
+
+void
+TokenizerBase::EnableCustomToken(Token const& aToken, bool aEnabled)
+{
+  if (aToken.mType == TOKEN_UNKNOWN) {
+    // Already removed
+    return;
+  }
+
+  for (UniquePtr<Token> const& custom : mCustomTokens) {
+    if (custom->Type() == aToken.Type()) {
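+      // Just flip the flag; IsCustom() consults mCustomEnabled on every
+      // Parse() call, so a disabled token is simply never matched.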
+      custom->mCustomEnabled = aEnabled;
+      return;
+    }
+  }
+
+  MOZ_ASSERT(false, "Token to change not found");
+}
+
+void
+TokenizerBase::SetTokenizingMode(Mode aMode)
+{
+  mMode = aMode;
+}
 
 bool
-Tokenizer::HasInput() const
+TokenizerBase::HasFailed() const
+{
+  return mHasFailed;
+}
+
+bool
+TokenizerBase::HasInput() const
 {
   return !mPastEof;
 }
 
 nsACString::const_char_iterator
-Tokenizer::Parse(Token& aToken) const
+TokenizerBase::Parse(Token& aToken) const
 {
   if (mCursor == mEnd) {
+    if (!mInputFinished) {
+      return mCursor;
+    }
+
     aToken = Token::EndOfFile();
     return mEnd;
   }
 
+  nsACString::size_type available = mEnd - mCursor;
+
+  uint32_t longestCustom = 0;
+  for (UniquePtr<Token> const& custom : mCustomTokens) {
+    if (IsCustom(mCursor, *custom, &longestCustom)) {
+      aToken = *custom;
+      return mCursor + custom->mCustom.Length();
+    }
+  }
+
+  if (!mInputFinished && available < longestCustom) {
+    // Not enough data to deterministically decide.
+    return mCursor;
+  }
+
   nsACString::const_char_iterator next = mCursor;
 
+  if (mMode == Mode::CUSTOM_ONLY) {
+    // We have to do a brute-force search for all of the enabled custom
+    // tokens.
+    while (next < mEnd) {
+      ++next;
+      for (UniquePtr<Token> const& custom : mCustomTokens) {
+        if (IsCustom(next, *custom)) {
+          aToken = Token::Raw();
+          return next;
+        }
+      }
+    }
+
+    if (mInputFinished) {
+      // End of the data reached.
+      aToken = Token::Raw();
+      return next;
+    }
+
+    if (longestCustom < available && available > mMinRawDelivery) {
+      // We can return some data without waiting for either a custom token
+      // or a call to FinishInput(), as long as we keep the tail where all
+      // the custom tokens could potentially fit, so that we cannot lose a
+      // partially delivered token. This preserves reasonable granularity.
+      aToken = Token::Raw();
+      return mEnd - longestCustom + 1;
+    }
+
+    // Not enough data to deterministically decide.
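+    // Returning the unmoved cursor signals "no progress" to the caller,
+    // which then waits for more input.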
+    return mCursor;
+  }
+
   enum State {
     PARSE_INTEGER,
     PARSE_WORD,
@@ -326,6 +451,9 @@ Tokenizer::Parse(Token& aToken) const
       resultingNumber += static_cast<uint64_t>(*next - '0');
 
       ++next;
+      if (IsPending(next)) {
+        break;
+      }
       if (IsEnd(next) || !IsNumber(*next)) {
         if (!resultingNumber.isValid()) {
           aToken = Token::Error();
@@ -338,6 +466,9 @@ Tokenizer::Parse(Token& aToken) const
 
     case PARSE_WORD:
       ++next;
+      if (IsPending(next)) {
+        break;
+      }
       if (IsEnd(next) || !IsWord(*next)) {
         aToken = Token::Word(Substring(mCursor, next));
         return next;
@@ -346,6 +477,9 @@ Tokenizer::Parse(Token& aToken) const
 
     case PARSE_CRLF:
       ++next;
+      if (IsPending(next)) {
+        break;
+      }
       if (!IsEnd(next) && *next == '\n') { // LF is optional
         ++next;
       }
@@ -369,17 +503,24 @@ Tokenizer::Parse(Token& aToken) const
     } // switch (state)
   } // while (next < end)
 
-  return next;
+  MOZ_ASSERT(!mInputFinished);
+  return mCursor;
 }
 
 bool
-Tokenizer::IsEnd(const nsACString::const_char_iterator& caret) const
+TokenizerBase::IsEnd(const nsACString::const_char_iterator& caret) const
 {
   return caret == mEnd;
 }
 
 bool
-Tokenizer::IsWordFirst(const char aInput) const
+TokenizerBase::IsPending(const nsACString::const_char_iterator& caret) const
+{
+  return IsEnd(caret) && !mInputFinished;
+}
+
+bool
+TokenizerBase::IsWordFirst(const char aInput) const
 {
   // TODO: make this fully work with unicode
   return (ToLowerCase(static_cast<uint32_t>(aInput)) !=
@@ -389,50 +530,107 @@ Tokenizer::IsWordFirst(const char aInput) const
 }
 
 bool
-Tokenizer::IsWord(const char aInput) const
+TokenizerBase::IsWord(const char aInput) const
 {
   return IsWordFirst(aInput) || IsNumber(aInput);
 }
 
 bool
-Tokenizer::IsNumber(const char aInput) const
+TokenizerBase::IsNumber(const char aInput) const
 {
   // TODO: are there unicode numbers?
   return aInput >= '0' && aInput <= '9';
 }
 
-// Tokenizer::Token
+bool
+TokenizerBase::IsCustom(const nsACString::const_char_iterator & caret,
+                        const Token & aCustomToken,
+                        uint32_t * aLongest) const
+{
+  MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
+  if (!aCustomToken.mCustomEnabled) {
+    return false;
+  }
 
-Tokenizer::Token::Token(const Token& aOther)
+  if (aLongest) {
+    *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length());
+  }
+
+  uint32_t inputLength = mEnd - caret;
+  if (aCustomToken.mCustom.Length() > inputLength) {
+    return false;
+  }
+
+  nsDependentCSubstring inputFragment(caret, aCustomToken.mCustom.Length());
+  if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
+    return inputFragment.Equals(aCustomToken.mCustom, nsCaseInsensitiveUTF8StringComparator());
+  }
+  return inputFragment.Equals(aCustomToken.mCustom);
+}
+
+void TokenizerBase::AssignFragment(Token& aToken,
+                                   nsACString::const_char_iterator begin,
+                                   nsACString::const_char_iterator end)
+{
+  aToken.AssignFragment(begin, end);
+}
+
+// TokenizerBase::Token
+
+TokenizerBase::Token::Token()
+  : mType(TOKEN_UNKNOWN)
+  , mChar(0)
+  , mInteger(0)
+  , mCustomCaseInsensitivity(CASE_SENSITIVE)
+  , mCustomEnabled(false)
+{
+}
+
+TokenizerBase::Token::Token(const Token& aOther)
   : mType(aOther.mType)
+  , mCustom(aOther.mCustom)
   , mChar(aOther.mChar)
   , mInteger(aOther.mInteger)
+  , mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity)
+  , mCustomEnabled(aOther.mCustomEnabled)
 {
-  if (mType == TOKEN_WORD) {
+  if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
     mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
   }
 }
 
-Tokenizer::Token&
-Tokenizer::Token::operator=(const Token& aOther)
+TokenizerBase::Token&
+TokenizerBase::Token::operator=(const Token& aOther)
 {
   mType = 
aOther.mType; + mCustom = aOther.mCustom; mChar = aOther.mChar; mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length()); mInteger = aOther.mInteger; + mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity; + mCustomEnabled = aOther.mCustomEnabled; return *this; } void -Tokenizer::Token::AssignFragment(nsACString::const_char_iterator begin, - nsACString::const_char_iterator end) +TokenizerBase::Token::AssignFragment(nsACString::const_char_iterator begin, + nsACString::const_char_iterator end) { mFragment.Rebind(begin, end - begin); } // static -Tokenizer::Token -Tokenizer::Token::Word(const nsACString& aValue) +TokenizerBase::Token +TokenizerBase::Token::Raw() +{ + Token t; + t.mType = TOKEN_RAW; + return t; +} + +// static +TokenizerBase::Token +TokenizerBase::Token::Word(const nsACString& aValue) { Token t; t.mType = TOKEN_WORD; @@ -441,8 +639,8 @@ Tokenizer::Token::Word(const nsACString& aValue) } // static -Tokenizer::Token -Tokenizer::Token::Char(const char aValue) +TokenizerBase::Token +TokenizerBase::Token::Char(const char aValue) { Token t; t.mType = TOKEN_CHAR; @@ -451,8 +649,8 @@ Tokenizer::Token::Char(const char aValue) } // static -Tokenizer::Token -Tokenizer::Token::Number(const uint64_t aValue) +TokenizerBase::Token +TokenizerBase::Token::Number(const uint64_t aValue) { Token t; t.mType = TOKEN_INTEGER; @@ -461,8 +659,8 @@ Tokenizer::Token::Number(const uint64_t aValue) } // static -Tokenizer::Token -Tokenizer::Token::Whitespace() +TokenizerBase::Token +TokenizerBase::Token::Whitespace() { Token t; t.mType = TOKEN_WS; @@ -471,8 +669,8 @@ Tokenizer::Token::Whitespace() } // static -Tokenizer::Token -Tokenizer::Token::NewLine() +TokenizerBase::Token +TokenizerBase::Token::NewLine() { Token t; t.mType = TOKEN_EOL; @@ -480,8 +678,8 @@ Tokenizer::Token::NewLine() } // static -Tokenizer::Token -Tokenizer::Token::EndOfFile() +TokenizerBase::Token +TokenizerBase::Token::EndOfFile() { Token t; t.mType = TOKEN_EOF; @@ -489,8 +687,8 @@ Tokenizer::Token::EndOfFile() } // static -Tokenizer::Token -Tokenizer::Token::Error() +TokenizerBase::Token +TokenizerBase::Token::Error() { Token t; t.mType = TOKEN_ERROR; @@ -498,7 +696,7 @@ Tokenizer::Token::Error() } bool -Tokenizer::Token::Equals(const Token& aOther) const +TokenizerBase::Token::Equals(const Token& aOther) const { if (mType != aOther.mType) { return false; @@ -517,21 +715,21 @@ Tokenizer::Token::Equals(const Token& aOther) const } char -Tokenizer::Token::AsChar() const +TokenizerBase::Token::AsChar() const { MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS); return mChar; } nsDependentCSubstring -Tokenizer::Token::AsString() const +TokenizerBase::Token::AsString() const { MOZ_ASSERT(mType == TOKEN_WORD); return mWord; } uint64_t -Tokenizer::Token::AsInteger() const +TokenizerBase::Token::AsInteger() const { MOZ_ASSERT(mType == TOKEN_INTEGER); return mInteger; diff --git a/xpcom/ds/Tokenizer.h b/xpcom/ds/Tokenizer.h index b13041f67877..b4aad9ed995f 100644 --- a/xpcom/ds/Tokenizer.h +++ b/xpcom/ds/Tokenizer.h @@ -9,32 +9,36 @@ #include "nsString.h" #include "mozilla/CheckedInt.h" +#include "mozilla/UniquePtr.h" +#include "nsTArray.h" namespace mozilla { -/** - * This is a simple implementation of a lexical analyzer or maybe better - * called a tokenizer. It doesn't allow any user dictionaries or - * user define token types. - * - * It is limited only to ASCII input for now. UTF-8 or any other input - * encoding must yet be implemented. 
- */
-class Tokenizer {
+class TokenizerBase
+{
 public:
   /**
    * The analyzer works with elements in the input cut to a sequence of token
    * where each token has an elementary type
    */
-  enum TokenType {
+  enum TokenType : uint32_t
+  {
     TOKEN_UNKNOWN,
+    TOKEN_RAW,
     TOKEN_ERROR,
     TOKEN_INTEGER,
     TOKEN_WORD,
     TOKEN_CHAR,
     TOKEN_WS,
     TOKEN_EOL,
-    TOKEN_EOF
+    TOKEN_EOF,
+    TOKEN_CUSTOM0 = 1000
+  };
+
+  enum ECaseSensitivity
+  {
+    CASE_SENSITIVE,
+    CASE_INSENSITIVE
   };
 
   /**
@@ -42,23 +46,29 @@ public:
    * to allow checks against it via methods of Tokenizer or are results of some of
    * the Tokenizer's methods.
    */
-  class Token {
+  class Token
+  {
     TokenType mType;
     nsDependentCSubstring mWord;
+    nsCString mCustom;
    char mChar;
     uint64_t mInteger;
+    ECaseSensitivity mCustomCaseInsensitivity;
+    bool mCustomEnabled;
 
     // If this token is a result of the parsing process, this member is referencing
     // a sub-string in the input buffer. If this is externally created Token this
     // member is left an empty string.
     nsDependentCSubstring mFragment;
 
-    friend class Tokenizer;
+    friend class TokenizerBase;
     void AssignFragment(nsACString::const_char_iterator begin,
                         nsACString::const_char_iterator end);
 
+    static Token Raw();
+
   public:
-    Token() : mType(TOKEN_UNKNOWN), mChar(0), mInteger(0) {}
+    Token();
     Token(const Token& aOther);
     Token& operator=(const Token& aOther);
 
@@ -83,6 +93,120 @@ public:
     nsDependentCSubstring Fragment() const { return mFragment; }
   };
 
+  /**
+   * Consumers may register a custom string that, when found in the input, is
+   * considered a token and is returned by the Next*() methods and accepted by
+   * the Check*() methods. AddCustomToken() returns a reference to a token that
+   * can then be compared using Token::Equals() against the output from Next*()
+   * or be passed to Check*().
+   */
+  Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
+  template <uint32_t N>
+  Token AddCustomToken(const char (&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
+  {
+    return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
+  }
+  void RemoveCustomToken(Token& aToken);
+  /**
+   * Only applies to a custom type of a Token (see AddCustomToken above.)
+   * This turns recognition of the token on and off. When a custom token is
+   * disabled, it's ignored as if it had never been added.
+   */
+  void EnableCustomToken(Token const& aToken, bool aEnable);
+
+  /**
+   * Mode of tokenization.
+   * FULL tokenization, the default, recognizes built-in tokens and any custom
+   * tokens, if added.
+   * CUSTOM_ONLY only recognizes custom tokens; everything else is delivered
+   * as 'raw'. This mode can be understood as a 'binary' mode.
+   */
+  enum class Mode
+  {
+    FULL,
+    CUSTOM_ONLY
+  };
+  void SetTokenizingMode(Mode aMode);
+
+  /**
+   * Returns true iff the last Check*() call has returned false or when we've
+   * read past the end of the input string.
+   */
+  MOZ_MUST_USE bool HasFailed() const;
+
+protected:
+  explicit TokenizerBase(const char* aWhitespaces = nullptr,
+                         const char* aAdditionalWordChars = nullptr);
+
+  // false if we have already read the EOF token.
+  bool HasInput() const;
+  // Main parsing function, it doesn't shift the read cursor, just returns the next
+  // token position.
+  nsACString::const_char_iterator Parse(Token& aToken) const;
+  // Is the read cursor at the end?
+  bool IsEnd(const nsACString::const_char_iterator& caret) const;
+  // True when we are at the end of the input data, but it has not been marked
+  // as complete yet. In that case we cannot proceed with providing a
+  // multi-char token.
+  bool IsPending(const nsACString::const_char_iterator& caret) const;
+  // Is the read cursor on a character that is a word start?
+  bool IsWordFirst(const char aInput) const;
+  // Is the read cursor on a character that is an in-word letter?
+  bool IsWord(const char aInput) const;
+  // Is the read cursor on a character that is a valid number?
+  // TODO - support multiple radix
+  bool IsNumber(const char aInput) const;
+  // Is equal to the given custom token?
+  bool IsCustom(const nsACString::const_char_iterator& caret,
+                const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
+
+  // Friendly helper to assign a fragment on a Token
+  static void AssignFragment(Token& aToken,
+                             nsACString::const_char_iterator begin,
+                             nsACString::const_char_iterator end);
+
+  // true iff we have already read the EOF token
+  bool mPastEof;
+  // true iff the last Check*() call has returned false, reverts to false on Rollback() call
+  bool mHasFailed;
+  // true if the input string is final (finished), false when we expect more data
+  // yet to be fed to the tokenizer (see the IncrementalTokenizer derived class).
+  bool mInputFinished;
+  // custom-only vs full tokenizing mode, see the Parse() method
+  Mode mMode;
+  // minimal raw data chunk delivery during an incremental feed
+  uint32_t mMinRawDelivery;
+
+  // Customizable list of whitespaces
+  const char* mWhitespaces;
+  // Additional custom word characters
+  const char* mAdditionalWordChars;
+
+  // All these point to the original buffer passed to the constructor or to the
+  // incremental buffer after FeedInput.
+  nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
+  nsACString::const_char_iterator mEnd; // End of the input position
+
+  // This is the list of tokens the user has registered with AddCustomToken()
+  nsTArray<UniquePtr<Token>> mCustomTokens;
+  uint32_t mNextCustomTokenID;
+
+private:
+  TokenizerBase() = delete;
+  TokenizerBase(const TokenizerBase&) = delete;
+  TokenizerBase(TokenizerBase&&) = delete;
+  TokenizerBase(const TokenizerBase&&) = delete;
+  TokenizerBase &operator=(const TokenizerBase&) = delete;
+};
+
+/**
+ * This is a simple implementation of a lexical analyzer or maybe better
+ * called a tokenizer. It doesn't allow any user dictionaries or
+ * user-defined token types.
+ *
+ * It is limited to ASCII input for now; UTF-8 or any other input
+ * encoding is yet to be implemented.
+ */
+class Tokenizer : public TokenizerBase
+{
 public:
   /**
    * @param aSource
@@ -133,13 +257,6 @@ public:
   MOZ_MUST_USE
   bool Check(const Token& aToken);
 
-  /**
-   * Return false iff the last Check*() call has returned false or when we've read past
-   * the end of the input string.
-   */
-  MOZ_MUST_USE
-  bool HasFailed() const;
-
   /**
    * SkipWhites method (below) may also skip new line characters automatically.
    */
@@ -312,36 +429,9 @@ public:
                     ClaimInclusion aInclude = EXCLUDE_LAST);
 
 protected:
-  // false if we have already read the EOF token.
-  bool HasInput() const;
-  // Main parsing function, it doesn't shift the read cursor, just returns the next
-  // token position.
-  nsACString::const_char_iterator Parse(Token& aToken) const;
-  // Is read cursor at the end?
-  bool IsEnd(const nsACString::const_char_iterator& caret) const;
-  // Is read cursor on a character that is a word start?
-  bool IsWordFirst(const char aInput) const;
-  // Is read cursor on a character that is an in-word letter?
- bool IsWord(const char aInput) const; - // Is read cursor on a character that is a valid number? - // TODO - support multiple radix - bool IsNumber(const char aInput) const; - - // true iff we have already read the EOF token - bool mPastEof; - // true iff the last Check*() call has returned false, reverts to true on Rollback() call - bool mHasFailed; - - // Customizable list of whitespaces - const char* mWhitespaces; - // Additinal custom word characters - const char* mAdditionalWordChars; - - // All these point to the original buffer passed to the Tokenizer + // All these point to the original buffer passed to the Tokenizer's constructor nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is nsACString::const_char_iterator mRollback; // Position of the previous token start - nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start - nsACString::const_char_iterator mEnd; // End of the input position private: Tokenizer() = delete; diff --git a/xpcom/ds/moz.build b/xpcom/ds/moz.build index 88bbbdc2fa00..b699fc74cd37 100644 --- a/xpcom/ds/moz.build +++ b/xpcom/ds/moz.build @@ -83,12 +83,14 @@ EXPORTS += [ ] EXPORTS.mozilla += [ + 'IncrementalTokenizer.h', 'Observer.h', 'StickyTimeDuration.h', 'Tokenizer.h', ] UNIFIED_SOURCES += [ + 'IncrementalTokenizer.cpp', 'nsArray.cpp', 'nsArrayEnumerator.cpp', 'nsArrayUtils.cpp', diff --git a/xpcom/tests/gtest/TestTokenizer.cpp b/xpcom/tests/gtest/TestTokenizer.cpp index b2a56acfd16c..283bbd3b8c19 100644 --- a/xpcom/tests/gtest/TestTokenizer.cpp +++ b/xpcom/tests/gtest/TestTokenizer.cpp @@ -5,6 +5,8 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "mozilla/Tokenizer.h" +#include "mozilla/IncrementalTokenizer.h" +#include "mozilla/Unused.h" #include "gtest/gtest.h" using namespace mozilla; @@ -732,3 +734,401 @@ TEST(Tokenizer, SkipUntil) EXPECT_TRUE(p.CheckEOF()); } } + +TEST(Tokenizer, Custom) +{ + Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2"); + + Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE); + Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE); + + // It's expected to NOT FIND the custom token if it's not on an edge + // between other recognizable tokens. + EXPECT_TRUE(p.CheckWord("aaaaaacustom")); + EXPECT_TRUE(p.CheckChar('-')); + EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1))); + EXPECT_TRUE(p.CheckEOL()); + EXPECT_TRUE(p.CheckChar(',')); + + EXPECT_TRUE(p.Check(c1)); + EXPECT_TRUE(p.CheckChar(',')); + + EXPECT_TRUE(p.Check(c1)); + EXPECT_TRUE(p.CheckChar(',')); + + p.EnableCustomToken(c1, false); + EXPECT_TRUE(p.CheckWord("Custom")); + EXPECT_TRUE(p.CheckChar('-')); + EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1))); + EXPECT_TRUE(p.CheckChar(',')); + + EXPECT_TRUE(p.Check(Tokenizer::Token::Number(0))); + EXPECT_TRUE(p.Check(c2)); + EXPECT_TRUE(p.CheckWord("xxxx")); + EXPECT_TRUE(p.CheckChar(',')); + + EXPECT_TRUE(p.CheckWord("CUSTOM")); + EXPECT_TRUE(p.CheckChar('-')); + EXPECT_TRUE(p.Check(Tokenizer::Token::Number(2))); + + EXPECT_TRUE(p.CheckEOF()); +} + +TEST(Tokenizer, CustomRaw) +{ + Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2"); + + Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE); + Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE); + + // In this mode it's expected to find all custom tokens among any kind of input. 
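+  // Anything between two custom-token matches is delivered as TOKEN_RAW fragments.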
+ p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY); + + Tokenizer::Token t; + + EXPECT_TRUE(p.Next(t)); + EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW); + EXPECT_TRUE(t.Fragment().EqualsLiteral("aaaaaa")); + + EXPECT_TRUE(p.Check(c1)); + + EXPECT_TRUE(p.Next(t)); + EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW); + EXPECT_TRUE(t.Fragment().EqualsLiteral("\r,")); + + EXPECT_TRUE(p.Check(c1)); + + EXPECT_TRUE(p.Next(t)); + EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW); + EXPECT_TRUE(t.Fragment().EqualsLiteral(",")); + + EXPECT_TRUE(p.Check(c1)); + + EXPECT_TRUE(p.Next(t)); + EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW); + EXPECT_TRUE(t.Fragment().EqualsLiteral(",")); + + EXPECT_TRUE(p.Check(c1)); + + EXPECT_TRUE(p.Next(t)); + EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW); + EXPECT_TRUE(t.Fragment().EqualsLiteral(",00")); + + EXPECT_TRUE(p.Check(c2)); + + EXPECT_TRUE(p.Next(t)); + EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW); + EXPECT_TRUE(t.Fragment().EqualsLiteral("xxxx,CUSTOM-2")); + + EXPECT_TRUE(p.CheckEOF()); +} + +TEST(Tokenizer, Incremental) +{ + typedef TokenizerBase::Token Token; + + int test = 0; + IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult + { + switch (++test) { + case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break; + case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break; + case 4: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 7: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break; + case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break; + } + + return NS_OK; + }); + + NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3"); + auto cur = input.BeginReading(); + auto end = input.EndReading(); + for (; cur < end; ++cur) { + i.FeedInput(nsDependentCSubstring(cur, 1)); + } + + EXPECT_TRUE(test == 6); + i.FinishInput(); + EXPECT_TRUE(test == 8); +} + +TEST(Tokenizer, IncrementalRollback) +{ + typedef TokenizerBase::Token Token; + + int test = 0; + IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult + { + switch (++test) { + case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break; + case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); + i.Rollback(); // so that we get the token again + break; + case 4: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break; + case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 7: EXPECT_TRUE(t.Equals(Token::Char(','))); break; + case 8: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break; + case 9: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break; + } + + return NS_OK; + }); + + NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3"); + auto cur = input.BeginReading(); + auto end = input.EndReading(); + for (; cur < end; ++cur) { + i.FeedInput(nsDependentCSubstring(cur, 1)); + } + + EXPECT_TRUE(test == 7); + i.FinishInput(); + EXPECT_TRUE(test == 9); +} + +TEST(Tokenizer, IncrementalNeedMoreInput) +{ + typedef TokenizerBase::Token Token; + + int test = 0; + IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult + { + Token t2; + switch (++test) { + case 1: + 
EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("a")))); + break; + case 2: + case 3: + case 4: + case 5: + EXPECT_TRUE(t.Equals(Token::Whitespace())); + if (i.Next(t2)) { + EXPECT_TRUE(test == 5); + EXPECT_TRUE(t2.Equals(Token::Word(NS_LITERAL_CSTRING("bb")))); + } else { + EXPECT_TRUE(test < 5); + i.NeedMoreInput(); + } + break; + case 6: + EXPECT_TRUE(t.Equals(Token::Char(','))); + break; + case 7: + EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("c")))); + return NS_ERROR_FAILURE; + default: + EXPECT_TRUE(false); + break; + } + + return NS_OK; + }); + + NS_NAMED_LITERAL_CSTRING(input, "a bb,c"); + auto cur = input.BeginReading(); + auto end = input.EndReading(); + + nsresult rv; + for (; cur < end; ++cur) { + rv = i.FeedInput(nsDependentCSubstring(cur, 1)); + if (NS_FAILED(rv)) { + break; + } + } + + EXPECT_TRUE(rv == NS_OK); + EXPECT_TRUE(test == 6); + + rv = i.FinishInput(); + EXPECT_TRUE(rv == NS_ERROR_FAILURE); + EXPECT_TRUE(test == 7); +} + +TEST(Tokenizer, IncrementalCustom) +{ + typedef TokenizerBase::Token Token; + + int test = 0; + Token custom; + IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult + { + switch (++test) { + case 1: EXPECT_TRUE(t.Equals(custom)); break; + case 2: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("bla")))); break; + case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break; + } + + return NS_OK; + }, nullptr, "-"); + + custom = i.AddCustomToken("some-test", Tokenizer::CASE_SENSITIVE); + i.FeedInput(NS_LITERAL_CSTRING("some-")); + EXPECT_TRUE(test == 0); + i.FeedInput(NS_LITERAL_CSTRING("tes")); + EXPECT_TRUE(test == 0); + i.FeedInput(NS_LITERAL_CSTRING("tbla")); + EXPECT_TRUE(test == 1); + i.FinishInput(); + EXPECT_TRUE(test == 3); +} + +TEST(Tokenizer, IncrementalCustomRaw) +{ + typedef TokenizerBase::Token Token; + + int test = 0; + Token custom; + IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult + { + switch (++test) { + case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("test1,")); break; + case 2: EXPECT_TRUE(t.Equals(custom)); break; + case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("!,,test3")); + i.Rollback(); + i.SetTokenizingMode(Tokenizer::Mode::FULL); + break; + case 4: EXPECT_TRUE(t.Equals(Token::Char('!'))); + i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY); + break; + case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral(",,test3")); break; + case 6: EXPECT_TRUE(t.Equals(custom)); break; + case 7: EXPECT_TRUE(t.Fragment().EqualsLiteral("tes")); break; + case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break; + } + + return NS_OK; + }); + + custom = i.AddCustomToken("test2", Tokenizer::CASE_SENSITIVE); + i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY); + + NS_NAMED_LITERAL_CSTRING(input, "test1,test2!,,test3test2tes"); + auto cur = input.BeginReading(); + auto end = input.EndReading(); + for (; cur < end; ++cur) { + i.FeedInput(nsDependentCSubstring(cur, 1)); + } + + EXPECT_TRUE(test == 6); + i.FinishInput(); + EXPECT_TRUE(test == 8); +} + +TEST(Tokenizer, IncrementalCustomRemove) +{ + typedef TokenizerBase::Token Token; + + int test = 0; + Token custom; + IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult + { + switch (++test) { + case 1: EXPECT_TRUE(t.Equals(custom)); + i.RemoveCustomToken(custom); + break; + case 2: EXPECT_FALSE(t.Equals(custom)); break; + case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break; + } + + return NS_OK; + }); + + custom = i.AddCustomToken("custom1", Tokenizer::CASE_SENSITIVE); + + 
NS_NAMED_LITERAL_CSTRING(input, "custom1custom1");
+  i.FeedInput(input);
+  EXPECT_TRUE(test == 1);
+  i.FinishInput();
+  EXPECT_TRUE(test == 3);
+}
+
+TEST(Tokenizer, IncrementalBuffering1)
+{
+  typedef TokenizerBase::Token Token;
+
+  int test = 0;
+  Token custom;
+  nsDependentCSubstring observedFragment;
+  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+  {
+    switch (++test) {
+    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("012")); break;
+    case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("3456789")); break;
+    case 3: EXPECT_TRUE(t.Equals(custom)); break;
+    case 4: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwe")); break;
+    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("rt")); break;
+    case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+    }
+
+    observedFragment.Rebind(t.Fragment().BeginReading(),
+                            t.Fragment().Length());
+    return NS_OK;
+  }, nullptr, nullptr, 3);
+
+  custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
+  // This externally unused token is added only to check that the internal
+  // algorithm works correctly when there are two custom tokens of different
+  // lengths.
+  Unused << i.AddCustomToken("bb", Tokenizer::CASE_SENSITIVE);
+  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+
+  i.FeedInput(NS_LITERAL_CSTRING("01234"));
+  EXPECT_TRUE(test == 1);
+  EXPECT_TRUE(observedFragment.EqualsLiteral("012"));
+
+  i.FeedInput(NS_LITERAL_CSTRING("5"));
+  EXPECT_TRUE(test == 1);
+  i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
+  EXPECT_TRUE(test == 2);
+  EXPECT_TRUE(observedFragment.EqualsLiteral("3456789"));
+
+  i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
+  EXPECT_TRUE(test == 4);
+  EXPECT_TRUE(observedFragment.EqualsLiteral("qwe"));
+
+  i.FinishInput();
+  EXPECT_TRUE(test == 6);
+}
+
+TEST(Tokenizer, IncrementalBuffering2)
+{
+  typedef TokenizerBase::Token Token;
+
+  int test = 0;
+  Token custom;
+  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+  {
+    switch (++test) {
+    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("01")); break;
+    case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("234567")); break;
+    case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("89")); break;
+    case 4: EXPECT_TRUE(t.Equals(custom)); break;
+    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwert")); break;
+    case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+    }
+    return NS_OK;
+  }, nullptr, nullptr, 3);
+
+  custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
+  // This externally unused token is added only to check that the internal
+  // algorithm works correctly when there are two custom tokens of different
+  // lengths.
+  Unused << i.AddCustomToken("bbbbb", Tokenizer::CASE_SENSITIVE);
+  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+
+  i.FeedInput(NS_LITERAL_CSTRING("01234"));
+  EXPECT_TRUE(test == 0);
+  i.FeedInput(NS_LITERAL_CSTRING("5"));
+  EXPECT_TRUE(test == 1);
+  i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
+  EXPECT_TRUE(test == 2);
+  i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
+  EXPECT_TRUE(test == 4);
+  i.FinishInput();
+  EXPECT_TRUE(test == 6);
+}
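
For reviewers, a minimal usage sketch of the new incremental API. ParseChunks and aChunks are hypothetical names for illustration, not part of this patch:

    // A minimal sketch, assuming a caller that receives its data in chunks.
    #include "mozilla/IncrementalTokenizer.h"
    #include "nsTArray.h"

    using namespace mozilla;

    nsresult ParseChunks(const nsTArray<nsCString>& aChunks)
    {
      int lines = 0;
      IncrementalTokenizer t(
        [&](TokenizerBase::Token const& aToken, IncrementalTokenizer&) -> nsresult {
          // The consumer fires once per recognized token; returning a failure
          // here aborts the FeedInput/FinishInput call that delivered it.
          if (aToken.Type() == TokenizerBase::TOKEN_EOL) {
            ++lines;
          }
          return NS_OK;
        });

      for (const nsCString& chunk : aChunks) {
        nsresult rv = t.FeedInput(chunk); // may buffer an incomplete tail token
        if (NS_FAILED(rv)) {
          return rv;
        }
      }
      // Marks the input complete: the buffered tail and TOKEN_EOF are delivered.
      return t.FinishInput();
    }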