Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1322825 - Incremental tokenizer. r=froydnj

This commit is contained in:
Parent: 5019a0d0df
Commit: 3a39b6dbee
IncrementalTokenizer.cpp
@@ -0,0 +1,195 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/IncrementalTokenizer.h"

#include "mozilla/AutoRestore.h"

#include "nsIInputStream.h"
#include "IncrementalTokenizer.h"
#include <algorithm>

namespace mozilla {

IncrementalTokenizer::IncrementalTokenizer(Consumer aConsumer,
                                           const char * aWhitespaces,
                                           const char * aAdditionalWordChars,
                                           uint32_t aRawMinBuffered)
  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
#ifdef DEBUG
  , mConsuming(false)
#endif
  , mNeedMoreInput(false)
  , mRollback(false)
  , mInputCursor(0)
  , mConsumer(aConsumer)
{
  mInputFinished = false;
  mMinRawDelivery = aRawMinBuffered;
}

nsresult IncrementalTokenizer::FeedInput(const nsACString & aInput)
{
  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
  MOZ_ASSERT(!mInputFinished);

  mInput.Cut(0, mInputCursor);
  mInputCursor = 0;

  mInput.Append(aInput);

  return Process();
}

nsresult IncrementalTokenizer::FeedInput(nsIInputStream * aInput, uint32_t aCount)
{
  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
  MOZ_ASSERT(!mInputFinished);
  MOZ_ASSERT(!mConsuming);

  mInput.Cut(0, mInputCursor);
  mInputCursor = 0;

  nsresult rv = NS_OK;
  while (NS_SUCCEEDED(rv) && aCount) {
    nsCString::index_type remainder = mInput.Length();
    nsCString::index_type load =
      std::min<nsCString::index_type>(aCount, PR_UINT32_MAX - remainder);

    if (!load) {
      // To keep the API simple, we fail if the input data buffer is filled.
      // It's highly unlikely there will ever be such an amount of data
      // accumulated unless there is a logic fault in the consumer code.
      NS_ERROR("IncrementalTokenizer consumer not reading data?");
      return NS_ERROR_OUT_OF_MEMORY;
    }

    if (!mInput.SetLength(remainder + load, fallible)) {
      return NS_ERROR_OUT_OF_MEMORY;
    }

    nsCString::char_iterator buffer = mInput.BeginWriting() + remainder;

    uint32_t read;
    rv = aInput->Read(buffer, load, &read);
    if (NS_SUCCEEDED(rv)) {
      // remainder + load fits the uint32_t size, so must remainder + read.
      mInput.SetLength(remainder + read);
      aCount -= read;

      rv = Process();
    }
  }

  return rv;
}

nsresult IncrementalTokenizer::FinishInput()
{
  NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
  MOZ_ASSERT(!mInputFinished);
  MOZ_ASSERT(!mConsuming);

  mInput.Cut(0, mInputCursor);
  mInputCursor = 0;

  mInputFinished = true;
  nsresult rv = Process();
  mConsumer = nullptr;
  return rv;
}

bool IncrementalTokenizer::Next(Token & aToken)
{
  // Assert we are called only from the consumer callback
  MOZ_ASSERT(mConsuming);

  if (mPastEof) {
    return false;
  }

  nsACString::const_char_iterator next = Parse(aToken);
  mPastEof = aToken.Type() == TOKEN_EOF;
  if (next == mCursor && !mPastEof) {
    // Not enough input to make a deterministic decision.
    return false;
  }

  AssignFragment(aToken, mCursor, next);
  mCursor = next;
  return true;
}

void IncrementalTokenizer::NeedMoreInput()
{
  // Assert we are called only from the consumer callback
  MOZ_ASSERT(mConsuming);

  // When the input has been finished, we can't set the flag, to prevent
  // an indefinite wait for more input (that will never come)
  mNeedMoreInput = !mInputFinished;
}

void IncrementalTokenizer::Rollback()
{
  // Assert we are called only from the consumer callback
  MOZ_ASSERT(mConsuming);

  mRollback = true;
}

nsresult IncrementalTokenizer::Process()
{
#ifdef DEBUG
  // Assert we are not re-entered
  MOZ_ASSERT(!mConsuming);

  AutoRestore<bool> consuming(mConsuming);
  mConsuming = true;
#endif

  MOZ_ASSERT(!mPastEof);

  nsresult rv = NS_OK;

  mInput.BeginReading(mCursor);
  mCursor += mInputCursor;
  mInput.EndReading(mEnd);

  while (NS_SUCCEEDED(rv) && !mPastEof) {
    Token token;
    nsACString::const_char_iterator next = Parse(token);
    mPastEof = token.Type() == TOKEN_EOF;
    if (next == mCursor && !mPastEof) {
      // Not enough input to make a deterministic decision.
      break;
    }

    AssignFragment(token, mCursor, next);

    nsACString::const_char_iterator rollback = mCursor;
    mCursor = next;

    mNeedMoreInput = mRollback = false;

    rv = mConsumer(token, *this);
    if (NS_FAILED(rv)) {
      break;
    }
    if (mNeedMoreInput || mRollback) {
      mCursor = rollback;
      mPastEof = false;
      if (mNeedMoreInput) {
        break;
      }
    }
  }

  mInputCursor = mCursor - mInput.BeginReading();
  return rv;
}

} // mozilla
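Taken together, FeedInput() and FinishInput() above drive Process(), which calls the consumer once per recognized token. A minimal usage sketch, assuming only the API added in this patch (the function and stream names are hypothetical):

#include "mozilla/IncrementalTokenizer.h"

using namespace mozilla;

// Hypothetical driver: feeds a stream into the tokenizer and lets the
// consumer lambda see every token as soon as it can be recognized.
nsresult ParseFromStream(nsIInputStream* aStream, uint32_t aAvailable)
{
  IncrementalTokenizer tokenizer(
    [](TokenizerBase::Token const& aToken,
       IncrementalTokenizer& aTokenizer) -> nsresult {
      if (aToken.Type() == TokenizerBase::TOKEN_ERROR) {
        return NS_ERROR_UNEXPECTED; // bubbles out of Feed/FinishInput
      }
      // ... handle aToken.Fragment() here ...
      return NS_OK;
    });

  // Process() runs inside this call and invokes the lambda for every
  // token that can already be deterministically recognized.
  nsresult rv = tokenizer.FeedInput(aStream, aAvailable);
  NS_ENSURE_SUCCESS(rv, rv);

  // Flushes any remaining buffered data and delivers the EOF token.
  return tokenizer.FinishInput();
}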
IncrementalTokenizer.h
@@ -0,0 +1,122 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef INCREMENTAL_TOKENIZER_H__
#define INCREMENTAL_TOKENIZER_H__

#include "mozilla/Tokenizer.h"

#include "nsError.h"
#include <functional>

class nsIInputStream;

namespace mozilla {

class IncrementalTokenizer : public TokenizerBase
{
public:
  /**
   * The consumer callback. The function is called for every single token
   * found in the input. A failure result returned by this callback stops
   * the tokenization immediately and bubbles up to the result of
   * Feed/FinishInput.
   *
   * Fragment()s of consumed tokens are guaranteed to remain valid until the
   * next call to Feed/FinishInput and point into a single linear buffer.
   * Hence, they can safely be used to accumulate data for processing after
   * Feed/FinishInput has returned.
   */
  typedef std::function<nsresult(Token const&, IncrementalTokenizer& i)> Consumer;

  /**
   * For the aWhitespaces and aAdditionalWordChars arguments see TokenizerBase.
   *
   * @param aConsumer
   *    A mandatory non-null argument, a function that consumes the tokens as
   *    they come when the tokenizer is fed.
   * @param aRawMinBuffered
   *    When we have buffered at least aRawMinBuffered data, but no custom
   *    token has been found so far because the incremental feed chunks are
   *    too small, deliver the raw data to preserve streaming and to save
   *    memory. This only has an effect in the custom-only tokenizing mode
   *    (Mode::CUSTOM_ONLY).
   */
  explicit IncrementalTokenizer(Consumer aConsumer,
                                const char* aWhitespaces = nullptr,
                                const char* aAdditionalWordChars = nullptr,
                                uint32_t aRawMinBuffered = 1024);

  /**
   * Pushes input to be tokenized. These directly call the Consumer callback
   * on every token found. The result of the Consumer callback is returned
   * here.
   *
   * The tokenizer must be initialized with a valid consumer prior to calling
   * these methods. It's not allowed to call Feed/FinishInput from inside the
   * Consumer callback.
   */
  nsresult FeedInput(const nsACString& aInput);
  nsresult FeedInput(nsIInputStream* aInput, uint32_t aCount);
  nsresult FinishInput();

  /**
   * Can only be called from inside the consumer callback.
   *
   * When there is still anything to read from the input, tokenize it, store
   * the token type and value in the aToken result and shift the cursor past
   * the just-parsed token. Each call to Next() reads another token from
   * the input and shifts the cursor.
   *
   * Returns false if there is not enough data to deterministically recognize
   * a token or when the last returned token was EOF.
   */
  MOZ_MUST_USE
  bool Next(Token& aToken);

  /**
   * Can only be called from inside the consumer callback.
   *
   * Tells the tokenizer to revert the cursor and stop the async parsing until
   * the next feed of the input. This is useful when more than one token is
   * needed to decide on the syntax but there is not enough input to get the
   * next token (Next() returned false.)
   */
  void NeedMoreInput();

  /**
   * Can only be called from inside the consumer callback.
   *
   * This makes the consumer callback be called again with the input parsed
   * again from the previous cursor position. This is useful when the
   * tokenizer state (custom tokens, tokenization mode) has changed and
   * we want to re-parse the input.
   */
  void Rollback();

private:
  // Loops over the input with TokenizerBase::Parse and calls the Consumer
  // callback.
  nsresult Process();

#ifdef DEBUG
  // True when inside the consumer callback, used only for assertions.
  bool mConsuming;
#endif // DEBUG
  // Modifiable only from the Consumer callback, tells the parser to break,
  // roll back and wait for more input.
  bool mNeedMoreInput;
  // Modifiable only from the Consumer callback, tells the parser to roll back
  // and parse the input again, with (if modified) new settings of the
  // tokenizer.
  bool mRollback;
  // The input buffer. Updated with each call to Feed/FinishInput.
  nsCString mInput;
  // Numerical index pointing at the current cursor position. We don't keep a
  // direct reference into the string buffer since the buffer often gets
  // reallocated.
  nsCString::index_type mInputCursor;
  // Reference to the consumer function.
  Consumer mConsumer;
};

} // mozilla

#endif
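The Next()/NeedMoreInput() contract documented above is easiest to see in a consumer that needs two tokens before it can decide. A minimal sketch, assuming a hypothetical "key=value" grammar (everything except the IncrementalTokenizer API is made up):

#include "mozilla/IncrementalTokenizer.h"

using namespace mozilla;

// Hypothetical "key=value" grammar: when the key word has arrived but the
// following character has not, the consumer asks for more input and will
// be called again with the same token after the next FeedInput().
IncrementalTokenizer tokenizer(
  [](TokenizerBase::Token const& aToken,
     IncrementalTokenizer& aTokenizer) -> nsresult {
    if (aToken.Type() != TokenizerBase::TOKEN_WORD) {
      return NS_OK; // separators are uninteresting in this sketch
    }

    TokenizerBase::Token next;
    if (!aTokenizer.Next(next)) {
      // Not enough buffered data to read the token following the key;
      // roll back to aToken and suspend until more input is fed.
      aTokenizer.NeedMoreInput();
      return NS_OK;
    }

    bool isAssignment = next.Equals(TokenizerBase::Token::Char('='));
    // ... handle aToken.AsString() and isAssignment ...
    return NS_OK;
  });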
Tokenizer.cpp
@@ -7,6 +7,7 @@
#include "Tokenizer.h"

#include "nsUnicharUtils.h"
#include <algorithm>

namespace mozilla {

@@ -15,11 +16,9 @@ static const char sWhitespaces[] = " \t";
Tokenizer::Tokenizer(const nsACString& aSource,
                     const char* aWhitespaces,
                     const char* aAdditionalWordChars)
  : mPastEof(false)
  , mHasFailed(false)
  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
  , mAdditionalWordChars(aAdditionalWordChars)
  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
{
  mInputFinished = true;
  aSource.BeginReading(mCursor);
  mRecord = mRollback = mCursor;
  aSource.EndReading(mEnd);

@@ -43,7 +42,7 @@ Tokenizer::Next(Token& aToken)
  mRollback = mCursor;
  mCursor = Parse(aToken);

  aToken.AssignFragment(mRollback, mCursor);
  AssignFragment(aToken, mRollback, mCursor);

  mPastEof = aToken.Type() == TOKEN_EOF;
  mHasFailed = false;

@@ -67,7 +66,7 @@ Tokenizer::Check(const TokenType aTokenType, Token& aResult)
  mRollback = mCursor;
  mCursor = next;

  aResult.AssignFragment(mRollback, mCursor);
  AssignFragment(aResult, mRollback, mCursor);

  mPastEof = aResult.Type() == TOKEN_EOF;
  mHasFailed = false;

@@ -96,12 +95,6 @@ Tokenizer::Check(const Token& aToken)
  return true;
}

bool
Tokenizer::HasFailed() const
{
  return mHasFailed;
}

void
Tokenizer::SkipWhites(WhiteSkipping aIncludeNewLines)
{

@@ -275,24 +268,156 @@ Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
  aResult.Rebind(mRecord, close - mRecord);
}

// protected
// TokenizerBase

TokenizerBase::TokenizerBase(const char* aWhitespaces,
                             const char* aAdditionalWordChars)
  : mPastEof(false)
  , mHasFailed(false)
  , mInputFinished(true)
  , mMode(Mode::FULL)
  , mMinRawDelivery(1024)
  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
  , mAdditionalWordChars(aAdditionalWordChars)
  , mCursor(nullptr)
  , mEnd(nullptr)
  , mNextCustomTokenID(TOKEN_CUSTOM0)
{
}

TokenizerBase::Token
TokenizerBase::AddCustomToken(const nsACString & aValue,
                              ECaseSensitivity aCaseInsensitivity, bool aEnabled)
{
  MOZ_ASSERT(!aValue.IsEmpty());

  UniquePtr<Token>& t = *mCustomTokens.AppendElement();
  t = MakeUnique<Token>();

  t->mType = static_cast<TokenType>(++mNextCustomTokenID);
  t->mCustomCaseInsensitivity = aCaseInsensitivity;
  t->mCustomEnabled = aEnabled;
  t->mCustom.Assign(aValue);
  return *t;
}

void
TokenizerBase::RemoveCustomToken(Token& aToken)
{
  if (aToken.mType == TOKEN_UNKNOWN) {
    // Already removed
    return;
  }

  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (custom->mType == aToken.mType) {
      mCustomTokens.RemoveElement(custom);
      aToken.mType = TOKEN_UNKNOWN;
      return;
    }
  }

  MOZ_ASSERT(false, "Token to remove not found");
}

void
TokenizerBase::EnableCustomToken(Token const& aToken, bool aEnabled)
{
  if (aToken.mType == TOKEN_UNKNOWN) {
    // Already removed
    return;
  }

  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (custom->Type() == aToken.Type()) {
      // This effectively destroys the token instance.
      custom->mCustomEnabled = aEnabled;
      return;
    }
  }

  MOZ_ASSERT(false, "Token to change not found");
}

void
TokenizerBase::SetTokenizingMode(Mode aMode)
{
  mMode = aMode;
}

bool
Tokenizer::HasInput() const
TokenizerBase::HasFailed() const
{
  return mHasFailed;
}

bool
TokenizerBase::HasInput() const
{
  return !mPastEof;
}

nsACString::const_char_iterator
Tokenizer::Parse(Token& aToken) const
TokenizerBase::Parse(Token& aToken) const
{
  if (mCursor == mEnd) {
    if (!mInputFinished) {
      return mCursor;
    }

    aToken = Token::EndOfFile();
    return mEnd;
  }

  nsACString::size_type available = mEnd - mCursor;

  uint32_t longestCustom = 0;
  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (IsCustom(mCursor, *custom, &longestCustom)) {
      aToken = *custom;
      return mCursor + custom->mCustom.Length();
    }
  }

  if (!mInputFinished && available < longestCustom) {
    // Not enough data to deterministically decide.
    return mCursor;
  }

  nsACString::const_char_iterator next = mCursor;

  if (mMode == Mode::CUSTOM_ONLY) {
    // We have to do a brute-force search for all of the enabled custom
    // tokens.
    while (next < mEnd) {
      ++next;
      for (UniquePtr<Token> const& custom : mCustomTokens) {
        if (IsCustom(next, *custom)) {
          aToken = Token::Raw();
          return next;
        }
      }
    }

    if (mInputFinished) {
      // End of the data reached.
      aToken = Token::Raw();
      return next;
    }

    if (longestCustom < available && available > mMinRawDelivery) {
      // We can return some data w/o waiting for either a custom token or a
      // call to FinishInput() when we leave the tail where all the custom
      // tokens potentially fit, so we can't lose a token that has only
      // partially arrived. This preserves reasonable granularity.
      aToken = Token::Raw();
      return mEnd - longestCustom + 1;
    }

    // Not enough data to deterministically decide.
    return mCursor;
  }

enum State {
  PARSE_INTEGER,
  PARSE_WORD,

@@ -326,6 +451,9 @@ Tokenizer::Parse(Token& aToken) const
      resultingNumber += static_cast<uint64_t>(*next - '0');

      ++next;
      if (IsPending(next)) {
        break;
      }
      if (IsEnd(next) || !IsNumber(*next)) {
        if (!resultingNumber.isValid()) {
          aToken = Token::Error();

@@ -338,6 +466,9 @@ Tokenizer::Parse(Token& aToken) const

    case PARSE_WORD:
      ++next;
      if (IsPending(next)) {
        break;
      }
      if (IsEnd(next) || !IsWord(*next)) {
        aToken = Token::Word(Substring(mCursor, next));
        return next;

@@ -346,6 +477,9 @@ Tokenizer::Parse(Token& aToken) const

    case PARSE_CRLF:
      ++next;
      if (IsPending(next)) {
        break;
      }
      if (!IsEnd(next) && *next == '\n') { // LF is optional
        ++next;
      }

@@ -369,17 +503,24 @@ Tokenizer::Parse(Token& aToken) const
  } // switch (state)
  } // while (next < end)

  return next;
  MOZ_ASSERT(!mInputFinished);
  return mCursor;
}

bool
Tokenizer::IsEnd(const nsACString::const_char_iterator& caret) const
TokenizerBase::IsEnd(const nsACString::const_char_iterator& caret) const
{
  return caret == mEnd;
}

bool
Tokenizer::IsWordFirst(const char aInput) const
TokenizerBase::IsPending(const nsACString::const_char_iterator& caret) const
{
  return IsEnd(caret) && !mInputFinished;
}

bool
TokenizerBase::IsWordFirst(const char aInput) const
{
  // TODO: make this fully work with unicode
  return (ToLowerCase(static_cast<uint32_t>(aInput)) !=

@@ -389,50 +530,107 @@ Tokenizer::IsWordFirst(const char aInput) const
}

bool
Tokenizer::IsWord(const char aInput) const
TokenizerBase::IsWord(const char aInput) const
{
  return IsWordFirst(aInput) || IsNumber(aInput);
}

bool
Tokenizer::IsNumber(const char aInput) const
TokenizerBase::IsNumber(const char aInput) const
{
  // TODO: are there unicode numbers?
  return aInput >= '0' && aInput <= '9';
}

// Tokenizer::Token
bool
TokenizerBase::IsCustom(const nsACString::const_char_iterator & caret,
                        const Token & aCustomToken,
                        uint32_t * aLongest) const
{
  MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
  if (!aCustomToken.mCustomEnabled) {
    return false;
  }

Tokenizer::Token::Token(const Token& aOther)
  if (aLongest) {
    *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length());
  }

  uint32_t inputLength = mEnd - caret;
  if (aCustomToken.mCustom.Length() > inputLength) {
    return false;
  }

  nsDependentCSubstring inputFragment(caret, aCustomToken.mCustom.Length());
  if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
    return inputFragment.Equals(aCustomToken.mCustom, nsCaseInsensitiveUTF8StringComparator());
  }
  return inputFragment.Equals(aCustomToken.mCustom);
}

void TokenizerBase::AssignFragment(Token& aToken,
                                   nsACString::const_char_iterator begin,
                                   nsACString::const_char_iterator end)
{
  aToken.AssignFragment(begin, end);
}

// TokenizerBase::Token

TokenizerBase::Token::Token()
  : mType(TOKEN_UNKNOWN)
  , mChar(0)
  , mInteger(0)
  , mCustomCaseInsensitivity(CASE_SENSITIVE)
  , mCustomEnabled(false)
{
}

TokenizerBase::Token::Token(const Token& aOther)
  : mType(aOther.mType)
  , mCustom(aOther.mCustom)
  , mChar(aOther.mChar)
  , mInteger(aOther.mInteger)
  , mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity)
  , mCustomEnabled(aOther.mCustomEnabled)
{
  if (mType == TOKEN_WORD) {
  if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
    mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  }
}

Tokenizer::Token&
Tokenizer::Token::operator=(const Token& aOther)
TokenizerBase::Token&
TokenizerBase::Token::operator=(const Token& aOther)
{
  mType = aOther.mType;
  mCustom = aOther.mCustom;
  mChar = aOther.mChar;
  mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  mInteger = aOther.mInteger;
  mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;
  mCustomEnabled = aOther.mCustomEnabled;
  return *this;
}

void
Tokenizer::Token::AssignFragment(nsACString::const_char_iterator begin,
TokenizerBase::Token::AssignFragment(nsACString::const_char_iterator begin,
                                     nsACString::const_char_iterator end)
{
  mFragment.Rebind(begin, end - begin);
}

// static
Tokenizer::Token
Tokenizer::Token::Word(const nsACString& aValue)
TokenizerBase::Token
TokenizerBase::Token::Raw()
{
  Token t;
  t.mType = TOKEN_RAW;
  return t;
}

// static
TokenizerBase::Token
TokenizerBase::Token::Word(const nsACString& aValue)
{
  Token t;
  t.mType = TOKEN_WORD;

@@ -441,8 +639,8 @@ Tokenizer::Token::Word(const nsACString& aValue)
}

// static
Tokenizer::Token
Tokenizer::Token::Char(const char aValue)
TokenizerBase::Token
TokenizerBase::Token::Char(const char aValue)
{
  Token t;
  t.mType = TOKEN_CHAR;

@@ -451,8 +649,8 @@ Tokenizer::Token::Char(const char aValue)
}

// static
Tokenizer::Token
Tokenizer::Token::Number(const uint64_t aValue)
TokenizerBase::Token
TokenizerBase::Token::Number(const uint64_t aValue)
{
  Token t;
  t.mType = TOKEN_INTEGER;

@@ -461,8 +659,8 @@ Tokenizer::Token::Number(const uint64_t aValue)
}

// static
Tokenizer::Token
Tokenizer::Token::Whitespace()
TokenizerBase::Token
TokenizerBase::Token::Whitespace()
{
  Token t;
  t.mType = TOKEN_WS;

@@ -471,8 +669,8 @@ Tokenizer::Token::Whitespace()
}

// static
Tokenizer::Token
Tokenizer::Token::NewLine()
TokenizerBase::Token
TokenizerBase::Token::NewLine()
{
  Token t;
  t.mType = TOKEN_EOL;

@@ -480,8 +678,8 @@ Tokenizer::Token::NewLine()
}

// static
Tokenizer::Token
Tokenizer::Token::EndOfFile()
TokenizerBase::Token
TokenizerBase::Token::EndOfFile()
{
  Token t;
  t.mType = TOKEN_EOF;

@@ -489,8 +687,8 @@ Tokenizer::Token::EndOfFile()
}

// static
Tokenizer::Token
Tokenizer::Token::Error()
TokenizerBase::Token
TokenizerBase::Token::Error()
{
  Token t;
  t.mType = TOKEN_ERROR;

@@ -498,7 +696,7 @@ Tokenizer::Token::Error()
}

bool
Tokenizer::Token::Equals(const Token& aOther) const
TokenizerBase::Token::Equals(const Token& aOther) const
{
  if (mType != aOther.mType) {
    return false;

@@ -517,21 +715,21 @@ Tokenizer::Token::Equals(const Token& aOther) const
}

char
Tokenizer::Token::AsChar() const
TokenizerBase::Token::AsChar() const
{
  MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
  return mChar;
}

nsDependentCSubstring
Tokenizer::Token::AsString() const
TokenizerBase::Token::AsString() const
{
  MOZ_ASSERT(mType == TOKEN_WORD);
  return mWord;
}

uint64_t
Tokenizer::Token::AsInteger() const
TokenizerBase::Token::AsInteger() const
{
  MOZ_ASSERT(mType == TOKEN_INTEGER);
  return mInteger;
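The raw-delivery cut-off in the CUSTOM_ONLY branch of Parse() above deserves a worked example. A minimal standalone sketch of the same arithmetic (the helper name and buffer contents are hypothetical, mirroring the IncrementalBuffering1 test further down):

#include <cassert>
#include <cstdint>
#include <cstring>

// Mirrors `return mEnd - longestCustom + 1;` above. With unfinished input,
// everything before the last (longestCustom - 1) bytes is safe to flush as
// TOKEN_RAW: no custom token can start there and still reach beyond the
// current end of the buffer.
const char* SafeRawEnd(const char* end, uint32_t longestCustom)
{
  return end - longestCustom + 1;
}

int main()
{
  const char buf[] = "6789aa";  // "aaa" (length 3) is the longest token
  const char* end = buf + std::strlen(buf);
  const char* cut = SafeRawEnd(end, 3);
  // Bytes [buf, cut) == "6789" may be delivered raw; the trailing "aa"
  // must stay buffered because it may be the start of "aaa".
  assert(cut - buf == 4);
}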
Tokenizer.h
@@ -9,32 +9,36 @@

#include "nsString.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/UniquePtr.h"
#include "nsTArray.h"

namespace mozilla {

/**
 * This is a simple implementation of a lexical analyzer or maybe better
 * called a tokenizer. It doesn't allow any user dictionaries or
 * user-defined token types.
 *
 * It is limited to ASCII input for now. UTF-8 or any other input
 * encoding must yet be implemented.
 */
class Tokenizer {
class TokenizerBase
{
public:
  /**
   * The analyzer works with elements in the input cut to a sequence of tokens
   * where each token has an elementary type
   */
  enum TokenType {
  enum TokenType : uint32_t
  {
    TOKEN_UNKNOWN,
    TOKEN_RAW,
    TOKEN_ERROR,
    TOKEN_INTEGER,
    TOKEN_WORD,
    TOKEN_CHAR,
    TOKEN_WS,
    TOKEN_EOL,
    TOKEN_EOF
    TOKEN_EOF,
    TOKEN_CUSTOM0 = 1000
  };

  enum ECaseSensitivity
  {
    CASE_SENSITIVE,
    CASE_INSENSITIVE
  };

  /**

@@ -42,23 +46,29 @@ public:
   * to allow checks against it via methods of Tokenizer or are results of some of
   * the Tokenizer's methods.
   */
  class Token {
  class Token
  {
    TokenType mType;
    nsDependentCSubstring mWord;
    nsCString mCustom;
    char mChar;
    uint64_t mInteger;
    ECaseSensitivity mCustomCaseInsensitivity;
    bool mCustomEnabled;

    // If this token is a result of the parsing process, this member references
    // a sub-string in the input buffer. If this is an externally created Token,
    // this member is left an empty string.
    nsDependentCSubstring mFragment;

    friend class Tokenizer;
    friend class TokenizerBase;
    void AssignFragment(nsACString::const_char_iterator begin,
                        nsACString::const_char_iterator end);

    static Token Raw();

  public:
    Token() : mType(TOKEN_UNKNOWN), mChar(0), mInteger(0) {}
    Token();
    Token(const Token& aOther);
    Token& operator=(const Token& aOther);


@@ -83,6 +93,120 @@ public:
    nsDependentCSubstring Fragment() const { return mFragment; }
  };

  /**
   * Consumers may register a custom string that, when found in the input, is
   * considered a token and returned by the Next*() methods and accepted by the
   * Check*() methods. AddCustomToken() returns a reference to a token that can
   * then be compared using Token::Equals() against the output from Next*() or
   * be passed to Check*().
   */
  Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
  template <uint32_t N>
  Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
  {
    return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
  }
  void RemoveCustomToken(Token& aToken);
  /**
   * Only applies to a custom type of a Token (see AddCustomToken above.)
   * This turns token recognition on and off. When a custom token is disabled,
   * it's ignored as if it had never been added as a custom token.
   */
  void EnableCustomToken(Token const& aToken, bool aEnable);

  /**
   * Mode of tokenization.
   * FULL tokenization, the default, recognizes built-in tokens and any custom
   * tokens, if added.
   * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'.
   * This mode can be understood as a 'binary' mode.
   */
  enum class Mode
  {
    FULL,
    CUSTOM_ONLY
  };
  void SetTokenizingMode(Mode aMode);

  /**
   * Returns true iff the last Check*() call has returned false or when we've
   * read past the end of the input string.
   */
  MOZ_MUST_USE bool HasFailed() const;

protected:
  explicit TokenizerBase(const char* aWhitespaces = nullptr,
                         const char* aAdditionalWordChars = nullptr);

  // false if we have already read the EOF token.
  bool HasInput() const;
  // Main parsing function, it doesn't shift the read cursor, just returns the next
  // token position.
  nsACString::const_char_iterator Parse(Token& aToken) const;
  // Is the read cursor at the end?
  bool IsEnd(const nsACString::const_char_iterator& caret) const;
  // True when we are at the end of the input data, but it has not been marked
  // as complete yet. In that case we cannot proceed with providing a multi-char token.
  bool IsPending(const nsACString::const_char_iterator & caret) const;
  // Is the read cursor on a character that is a word start?
  bool IsWordFirst(const char aInput) const;
  // Is the read cursor on a character that is an in-word letter?
  bool IsWord(const char aInput) const;
  // Is the read cursor on a character that is a valid number?
  // TODO - support multiple radixes
  bool IsNumber(const char aInput) const;
  // Is the input at the caret equal to the given custom token?
  bool IsCustom(const nsACString::const_char_iterator& caret,
                const Token& aCustomToken, uint32_t* aLongest = nullptr) const;

  // Friendly helper to assign a fragment on a Token
  static void AssignFragment(Token& aToken,
                             nsACString::const_char_iterator begin,
                             nsACString::const_char_iterator end);

  // true iff we have already read the EOF token
  bool mPastEof;
  // true iff the last Check*() call has returned false, reverts to true on Rollback() call
  bool mHasFailed;
  // true if the input string is final (finished), false when we expect more data
  // yet to be fed to the tokenizer (see the IncrementalTokenizer derived class).
  bool mInputFinished;
  // custom-only vs full tokenizing mode, see the Parse() method
  Mode mMode;
  // minimal raw data chunk delivery during an incremental feed
  uint32_t mMinRawDelivery;

  // Customizable list of whitespaces
  const char* mWhitespaces;
  // Additional custom word characters
  const char* mAdditionalWordChars;

  // All these point to the original buffer passed to the constructor or to the incremental
  // buffer after FeedInput.
  nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
  nsACString::const_char_iterator mEnd; // End of the input position

  // This is the list of tokens the user has registered with AddCustomToken()
  nsTArray<UniquePtr<Token>> mCustomTokens;
  uint32_t mNextCustomTokenID;

private:
  TokenizerBase() = delete;
  TokenizerBase(const TokenizerBase&) = delete;
  TokenizerBase(TokenizerBase&&) = delete;
  TokenizerBase(const TokenizerBase&&) = delete;
  TokenizerBase &operator=(const TokenizerBase&) = delete;
};

/**
 * This is a simple implementation of a lexical analyzer or maybe better
 * called a tokenizer. It doesn't allow any user dictionaries or
 * user-defined token types.
 *
 * It is limited to ASCII input for now. UTF-8 or any other input
 * encoding must yet be implemented.
 */
class Tokenizer : public TokenizerBase
{
public:
  /**
   * @param aSource

@@ -133,13 +257,6 @@ public:
  MOZ_MUST_USE
  bool Check(const Token& aToken);

  /**
   * Returns true iff the last Check*() call has returned false or when we've
   * read past the end of the input string.
   */
  MOZ_MUST_USE
  bool HasFailed() const;

  /**
   * SkipWhites method (below) may also skip new line characters automatically.
   */

@@ -312,36 +429,9 @@ public:
                     ClaimInclusion aInclude = EXCLUDE_LAST);

protected:
  // false if we have already read the EOF token.
  bool HasInput() const;
  // Main parsing function, it doesn't shift the read cursor, just returns the next
  // token position.
  nsACString::const_char_iterator Parse(Token& aToken) const;
  // Is the read cursor at the end?
  bool IsEnd(const nsACString::const_char_iterator& caret) const;
  // Is the read cursor on a character that is a word start?
  bool IsWordFirst(const char aInput) const;
  // Is the read cursor on a character that is an in-word letter?
  bool IsWord(const char aInput) const;
  // Is the read cursor on a character that is a valid number?
  // TODO - support multiple radixes
  bool IsNumber(const char aInput) const;

  // true iff we have already read the EOF token
  bool mPastEof;
  // true iff the last Check*() call has returned false, reverts to true on Rollback() call
  bool mHasFailed;

  // Customizable list of whitespaces
  const char* mWhitespaces;
  // Additional custom word characters
  const char* mAdditionalWordChars;

  // All these point to the original buffer passed to the Tokenizer
  // All these point to the original buffer passed to the Tokenizer's constructor
  nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
  nsACString::const_char_iterator mRollback; // Position of the previous token start
  nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
  nsACString::const_char_iterator mEnd; // End of the input position

private:
  Tokenizer() = delete;
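The custom-token API declared above composes with the tokenizing mode. A minimal sketch of scanning opaque data for a registered separator (the input and token strings are made up for illustration):

#include "mozilla/Tokenizer.h"

using namespace mozilla;

void SplitOnBoundary(const nsACString& aBlob)
{
  Tokenizer p(aBlob);
  Tokenizer::Token boundary =
    p.AddCustomToken("--boundary", Tokenizer::CASE_SENSITIVE);

  // Treat everything except the registered token as opaque raw data.
  p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  Tokenizer::Token t;
  while (p.Next(t)) {
    if (t.Equals(boundary)) {
      // ... a boundary was found ...
    } else if (t.Type() == Tokenizer::TOKEN_RAW) {
      // ... t.Fragment() is the data between boundaries ...
    }
  }
}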
moz.build
@@ -83,12 +83,14 @@ EXPORTS += [
]

EXPORTS.mozilla += [
    'IncrementalTokenizer.h',
    'Observer.h',
    'StickyTimeDuration.h',
    'Tokenizer.h',
]

UNIFIED_SOURCES += [
    'IncrementalTokenizer.cpp',
    'nsArray.cpp',
    'nsArrayEnumerator.cpp',
    'nsArrayUtils.cpp',
TestTokenizer.cpp
@@ -5,6 +5,8 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/Tokenizer.h"
#include "mozilla/IncrementalTokenizer.h"
#include "mozilla/Unused.h"
#include "gtest/gtest.h"

using namespace mozilla;

@@ -732,3 +734,401 @@ TEST(Tokenizer, SkipUntil)
    EXPECT_TRUE(p.CheckEOF());
  }
}

TEST(Tokenizer, Custom)
{
  Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2");

  Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE);
  Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE);

  // It's expected to NOT FIND the custom token if it's not on an edge
  // between other recognizable tokens.
  EXPECT_TRUE(p.CheckWord("aaaaaacustom"));
  EXPECT_TRUE(p.CheckChar('-'));
  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1)));
  EXPECT_TRUE(p.CheckEOL());
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.Check(c1));
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.Check(c1));
  EXPECT_TRUE(p.CheckChar(','));

  p.EnableCustomToken(c1, false);
  EXPECT_TRUE(p.CheckWord("Custom"));
  EXPECT_TRUE(p.CheckChar('-'));
  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1)));
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(0)));
  EXPECT_TRUE(p.Check(c2));
  EXPECT_TRUE(p.CheckWord("xxxx"));
  EXPECT_TRUE(p.CheckChar(','));

  EXPECT_TRUE(p.CheckWord("CUSTOM"));
  EXPECT_TRUE(p.CheckChar('-'));
  EXPECT_TRUE(p.Check(Tokenizer::Token::Number(2)));

  EXPECT_TRUE(p.CheckEOF());
}

TEST(Tokenizer, CustomRaw)
{
  Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2");

  Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE);
  Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE);

  // In this mode it's expected to find all custom tokens among any kind of input.
  p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  Tokenizer::Token t;

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral("aaaaaa"));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral("\r,"));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral(","));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral(","));

  EXPECT_TRUE(p.Check(c1));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral(",00"));

  EXPECT_TRUE(p.Check(c2));

  EXPECT_TRUE(p.Next(t));
  EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
  EXPECT_TRUE(t.Fragment().EqualsLiteral("xxxx,CUSTOM-2"));

  EXPECT_TRUE(p.CheckEOF());
}

TEST(Tokenizer, Incremental)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break;
    case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break;
    case 4: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 7: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break;
    case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3");
  auto cur = input.BeginReading();
  auto end = input.EndReading();
  for (; cur < end; ++cur) {
    i.FeedInput(nsDependentCSubstring(cur, 1));
  }

  EXPECT_TRUE(test == 6);
  i.FinishInput();
  EXPECT_TRUE(test == 8);
}

TEST(Tokenizer, IncrementalRollback)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break;
    case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2"))));
      i.Rollback(); // so that we get the token again
      break;
    case 4: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break;
    case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 7: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
    case 8: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break;
    case 9: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3");
  auto cur = input.BeginReading();
  auto end = input.EndReading();
  for (; cur < end; ++cur) {
    i.FeedInput(nsDependentCSubstring(cur, 1));
  }

  EXPECT_TRUE(test == 7);
  i.FinishInput();
  EXPECT_TRUE(test == 9);
}

TEST(Tokenizer, IncrementalNeedMoreInput)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    Token t2;
    switch (++test) {
    case 1:
      EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("a"))));
      break;
    case 2:
    case 3:
    case 4:
    case 5:
      EXPECT_TRUE(t.Equals(Token::Whitespace()));
      if (i.Next(t2)) {
        EXPECT_TRUE(test == 5);
        EXPECT_TRUE(t2.Equals(Token::Word(NS_LITERAL_CSTRING("bb"))));
      } else {
        EXPECT_TRUE(test < 5);
        i.NeedMoreInput();
      }
      break;
    case 6:
      EXPECT_TRUE(t.Equals(Token::Char(',')));
      break;
    case 7:
      EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("c"))));
      return NS_ERROR_FAILURE;
    default:
      EXPECT_TRUE(false);
      break;
    }

    return NS_OK;
  });

  NS_NAMED_LITERAL_CSTRING(input, "a bb,c");
  auto cur = input.BeginReading();
  auto end = input.EndReading();

  nsresult rv;
  for (; cur < end; ++cur) {
    rv = i.FeedInput(nsDependentCSubstring(cur, 1));
    if (NS_FAILED(rv)) {
      break;
    }
  }

  EXPECT_TRUE(rv == NS_OK);
  EXPECT_TRUE(test == 6);

  rv = i.FinishInput();
  EXPECT_TRUE(rv == NS_ERROR_FAILURE);
  EXPECT_TRUE(test == 7);
}

TEST(Tokenizer, IncrementalCustom)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(custom)); break;
    case 2: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("bla")))); break;
    case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  }, nullptr, "-");

  custom = i.AddCustomToken("some-test", Tokenizer::CASE_SENSITIVE);
  i.FeedInput(NS_LITERAL_CSTRING("some-"));
  EXPECT_TRUE(test == 0);
  i.FeedInput(NS_LITERAL_CSTRING("tes"));
  EXPECT_TRUE(test == 0);
  i.FeedInput(NS_LITERAL_CSTRING("tbla"));
  EXPECT_TRUE(test == 1);
  i.FinishInput();
  EXPECT_TRUE(test == 3);
}

TEST(Tokenizer, IncrementalCustomRaw)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("test1,")); break;
    case 2: EXPECT_TRUE(t.Equals(custom)); break;
    case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("!,,test3"));
      i.Rollback();
      i.SetTokenizingMode(Tokenizer::Mode::FULL);
      break;
    case 4: EXPECT_TRUE(t.Equals(Token::Char('!')));
      i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
      break;
    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral(",,test3")); break;
    case 6: EXPECT_TRUE(t.Equals(custom)); break;
    case 7: EXPECT_TRUE(t.Fragment().EqualsLiteral("tes")); break;
    case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  custom = i.AddCustomToken("test2", Tokenizer::CASE_SENSITIVE);
  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  NS_NAMED_LITERAL_CSTRING(input, "test1,test2!,,test3test2tes");
  auto cur = input.BeginReading();
  auto end = input.EndReading();
  for (; cur < end; ++cur) {
    i.FeedInput(nsDependentCSubstring(cur, 1));
  }

  EXPECT_TRUE(test == 6);
  i.FinishInput();
  EXPECT_TRUE(test == 8);
}

TEST(Tokenizer, IncrementalCustomRemove)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Equals(custom));
      i.RemoveCustomToken(custom);
      break;
    case 2: EXPECT_FALSE(t.Equals(custom)); break;
    case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    return NS_OK;
  });

  custom = i.AddCustomToken("custom1", Tokenizer::CASE_SENSITIVE);

  NS_NAMED_LITERAL_CSTRING(input, "custom1custom1");
  i.FeedInput(input);
  EXPECT_TRUE(test == 1);
  i.FinishInput();
  EXPECT_TRUE(test == 3);
}

TEST(Tokenizer, IncrementalBuffering1)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  nsDependentCSubstring observedFragment;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("012")); break;
    case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("3456789")); break;
    case 3: EXPECT_TRUE(t.Equals(custom)); break;
    case 4: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwe")); break;
    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("rt")); break;
    case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }

    observedFragment.Rebind(t.Fragment().BeginReading(),
                            t.Fragment().Length());
    return NS_OK;
  }, nullptr, nullptr, 3);

  custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
  // This externally unused token is added only to check the internal algorithm
  // does work correctly as expected when there are two different-length tokens.
  Unused << i.AddCustomToken("bb", Tokenizer::CASE_SENSITIVE);
  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  i.FeedInput(NS_LITERAL_CSTRING("01234"));
  EXPECT_TRUE(test == 1);
  EXPECT_TRUE(observedFragment.EqualsLiteral("012"));

  i.FeedInput(NS_LITERAL_CSTRING("5"));
  EXPECT_TRUE(test == 1);
  i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
  EXPECT_TRUE(test == 2);
  EXPECT_TRUE(observedFragment.EqualsLiteral("3456789"));

  i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
  EXPECT_TRUE(test == 4);
  EXPECT_TRUE(observedFragment.EqualsLiteral("qwe"));

  i.FinishInput();
  EXPECT_TRUE(test == 6);
}

TEST(Tokenizer, IncrementalBuffering2)
{
  typedef TokenizerBase::Token Token;

  int test = 0;
  Token custom;
  IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
  {
    switch (++test) {
    case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("01")); break;
    case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("234567")); break;
    case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("89")); break;
    case 4: EXPECT_TRUE(t.Equals(custom)); break;
    case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwert")); break;
    case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
    }
    return NS_OK;
  }, nullptr, nullptr, 3);

  custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
  // This externally unused token is added only to check the internal algorithm
  // does work correctly as expected when there are two different-length tokens.
  Unused << i.AddCustomToken("bbbbb", Tokenizer::CASE_SENSITIVE);
  i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);

  i.FeedInput(NS_LITERAL_CSTRING("01234"));
  EXPECT_TRUE(test == 0);
  i.FeedInput(NS_LITERAL_CSTRING("5"));
  EXPECT_TRUE(test == 1);
  i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
  EXPECT_TRUE(test == 2);
  i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
  EXPECT_TRUE(test == 4);
  i.FinishInput();
  EXPECT_TRUE(test == 6);
}