gecko-dev/parser/htmlparser/nsScanner.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=2 sw=2 et tw=78: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//#define __INCREMENTAL 1

#include "nsScanner.h"

#include "mozilla/Attributes.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/Encoding.h"
#include "nsDebug.h"
#include "nsReadableUtils.h"
#include "nsIInputStream.h"
#include "nsIFile.h"
#include "nsUTF8Utils.h"  // for LossyConvertEncoding
#include "nsCRT.h"
#include "nsParser.h"
#include "nsCharsetSource.h"

nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars)
    : mChars(aTerminateChars),
      mFilter(char16_t(~0))  // Start with every bit set.
{
  // Clear from mFilter every bit that appears in at least one terminating
  // character. What remains are the bits that are zero in *all* terminators,
  // so a character with any of those bits set cannot be a terminator and
  // can be rejected with a single AND, without walking mChars.
  //
  // aTerminateChars must be a zero-terminated string; the walk stops at the
  // first 0 code unit.
  for (const char16_t* cursor = aTerminateChars; *cursor; ++cursor) {
    mFilter &= ~*cursor;
  }
}

/**
 *  Use this constructor if you want i/o to be based on
 *  a single string you hand in during construction.
 *  This short cut was added for Javascript.
 *
 *  The resulting scanner is non-incremental and has no decoder or charset
 *  source set up.
 *
 *  @update  gess 5/12/98
 *  @param   anHTMLString the complete text to scan
 */
nsScanner::nsScanner(const nsAString& anHTMLString) {
  MOZ_COUNT_CTOR(nsScanner);

  // AppendToBuffer decides between "create" and "append" based on this
  // pointer, so it has to be null before the call.
  mSlidingBuffer = nullptr;
  const bool buffered = AppendToBuffer(anHTMLString);
  if (!buffered) {
    // XXX Hack (bug 182067): there is no buffer to point the iterators at,
    // so zero the current position and make the end position equal to it.
    // With current == end the reading methods see EOF.
    memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
    mEndPosition = mCurrentPosition;
  } else {
    mSlidingBuffer->BeginReading(mCurrentPosition);
  }
  mMarkPosition = mCurrentPosition;

  mIncremental = false;
  mUnicodeDecoder = nullptr;
  mCharsetSource = kCharsetUninitialized;
}

/**
 *  Use this constructor if you want i/o to be based on strings
 *  the scanner receives. If you pass a null filename, you
 *  can still provide data to the scanner via append.
 *
 *  The scanner starts out empty, incremental, and with a UTF-8 decoder.
 *
 *  @param   aFilename copied into mFilename
 *  @param   aCreateStream asserted to be false; not used otherwise
 */
nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
    : mFilename(aFilename) {
  MOZ_COUNT_CTOR(nsScanner);
  NS_ASSERTION(!aCreateStream, "This is always true.");

  // No data yet; the buffer is created by the first append.
  mSlidingBuffer = nullptr;

  // XXX This is a big hack (bug 182067). The iterators need *some* value,
  // and the only thing that matters is mCurrentPosition == mEndPosition so
  // that the reading methods believe we are at EOF. Zeroing the current
  // position gives some hope of catching null pointer dereferences that
  // result from this hack. --darin
  memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
  mEndPosition = mCurrentPosition;
  mMarkPosition = mCurrentPosition;

  mIncremental = true;

  // Reset the decoder state before SetDocumentCharset looks at
  // mCharsetSource and installs a decoder.
  mUnicodeDecoder = nullptr;
  mCharsetSource = kCharsetUninitialized;

  // XML defaults to UTF-8 and about:blank is UTF-8, too.
  SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault);
}

/**
 *  Select the encoding used to decode bytes handed to Append().
 *
 *  A request whose source ranks below the current mCharsetSource is
 *  ignored. Otherwise the source is recorded, and a new decoder is created
 *  unless the requested encoding has the same name as the current one.
 *
 *  @param   aEncoding the encoding to switch to
 *  @param   aSource priority of this request; compared against
 *           mCharsetSource
 *  @return  always NS_OK
 */
nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding,
                                       int32_t aSource) {
  // A lower-priority source may not override the current choice.
  if (aSource < mCharsetSource) {
    return NS_OK;
  }
  mCharsetSource = aSource;

  nsCString requestedName;
  aEncoding->Name(requestedName);

  // An empty mCharset means no encoding has been installed yet, so it never
  // counts as "unchanged".
  const bool unchanged =
      !mCharset.IsEmpty() && requestedName.Equals(mCharset);
  if (!unchanged) {
    mCharset.Assign(requestedName);
    mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval();
  }

  return NS_OK;
}

/**
 *  Destructor. Frees the sliding buffer, if one was ever created
 *  (deleting a null pointer is a no-op).
 *
 *  @update  gess 3/25/98
 */
nsScanner::~nsScanner() {
  delete mSlidingBuffer;

  // Pairs with MOZ_COUNT_CTOR in the constructors.
  MOZ_COUNT_DTOR(nsScanner);
}

/**
 *  Moves the current read position back to the marked position.
 *  This allows us to back up to that point if the need should arise,
 *  such as when tokenization gets interrupted.
 *
 *  Does nothing when there is no buffer. Rewinding is only meaningful
 *  once a mark has been recorded, so call Mark() first.
 *
 *  @update  gess 5/12/98
 */
void nsScanner::RewindToMark() {
  if (!mSlidingBuffer) {
    return;
  }

  mCurrentPosition = mMarkPosition;
}

/**
 *  Records current offset position in input stream. This allows us
 *  to back up to this point if the need should arise, such as when
 *  tokenization gets interrupted.
 *
 *  @update  gess 7/29/98
 *  @param
 *  @return
 */
int32_t nsScanner::Mark() {
  int32_t distance = 0;
  if (mSlidingBuffer) {
    nsScannerIterator oldStart;
    mSlidingBuffer->BeginReading(oldStart);

    distance = Distance(oldStart, mCurrentPosition);

    mSlidingBuffer->DiscardPrefix(mCurrentPosition);
    mSlidingBuffer->BeginReading(mCurrentPosition);
    mMarkPosition = mCurrentPosition;
  }

  return distance;
}

/**
 * Insert data to our underlying input buffer as
 * if it were read from an input stream. The data is inserted at the
 * current position, and the current position is then moved to the
 * beginning of the buffer.
 *
 * @update  harishd 01/12/99
 * @param   aBuffer the text to insert
 * @return  true if the data was inserted, false if there is no buffer
 *          to insert into
 */
bool nsScanner::UngetReadable(const nsAString& aBuffer) {
  if (mSlidingBuffer) {
    mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition);

    // Insertion invalidated our iterators, so fetch fresh ones.
    // NOTE(review): mMarkPosition is not refreshed here - confirm that is
    // intended.
    mSlidingBuffer->BeginReading(mCurrentPosition);
    mSlidingBuffer->EndReading(mEndPosition);
    return true;
  }

  return false;
}

/**
 * Append data to our underlying input buffer as
 * if it were read from an input stream.
 *
 * @update  gess4/3/98
 * @param   aBuffer the already-decoded text to append
 * @return  NS_OK on success, NS_ERROR_OUT_OF_MEMORY if the data could not
 *          be added to the buffer
 */
nsresult nsScanner::Append(const nsAString& aBuffer) {
  return AppendToBuffer(aBuffer) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}

/**
 *  Decode raw bytes with the current decoder and append the resulting
 *  UTF-16 text to the input buffer.
 *
 *  If the decoder reports anything other than "input empty", a single
 *  U+FFFF is appended after the text decoded so far and the rest of the
 *  input is not converted.
 *
 *  @update  gess 5/21/98
 *  @param   aBuffer the bytes to decode
 *  @param   aLen number of bytes in aBuffer
 *  @return  NS_OK on success, NS_ERROR_FAILURE if no decoder is installed,
 *           NS_ERROR_OUT_OF_MEMORY if the output size overflows or the
 *           buffer cannot be allocated or appended
 */
nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) {
  if (!mUnicodeDecoder) {
    NS_WARNING("No decoder found.");
    return NS_ERROR_FAILURE;
  }

  // Worst-case number of UTF-16 code units the decoder may produce.
  CheckedInt<size_t> needed = mUnicodeDecoder->MaxUTF16BufferLength(aLen);
  if (!needed.isValid()) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // One extra unit: null terminator due to legacy sadness. The same slot is
  // what makes room for the U+FFFF marker written below.
  CheckedInt<uint32_t> allocLen(1);
  allocLen += needed.value();
  if (!allocLen.isValid()) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  nsScannerString::Buffer* buffer =
      nsScannerString::AllocBuffer(allocLen.value());
  NS_ENSURE_TRUE(buffer, NS_ERROR_OUT_OF_MEMORY);
  char16_t* unichars = buffer->DataStart();

  uint32_t result;
  size_t read;
  size_t written;
  Tie(result, read, written) =
      mUnicodeDecoder->DecodeToUTF16WithoutReplacement(
          AsBytes(MakeSpan(aBuffer, aLen)),
          MakeSpan(unichars, needed.value()),
          false);  // Retain bug about failure to handle EOF
  MOZ_ASSERT(result != kOutputFull);
  MOZ_ASSERT(read <= aLen);
  MOZ_ASSERT(written <= needed.value());

  if (result != kInputEmpty) {
    // Since about:blank is empty, this line runs only for XML. Use a
    // character that's illegal in XML instead of U+FFFD in order to make
    // expat flag the error. There is no need to loop and convert more, since
    // expat will stop here anyway.
    unichars[written] = 0xFFFF;
    ++written;
  }
  buffer->SetDataLength(written);

  // Don't propagate return code of unicode decoder
  // since it doesn't reflect on our success or failure
  // - Ref. bug 87110
  return AppendToBuffer(buffer) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}

/**
 *  Retrieve the next char from the scanner's internal input buffer and
 *  advance past it.
 *
 *  @update  gess 3/25/98
 *  @param   aChar receives the character read, or 0 at end of input
 *  @return  NS_OK if a character was read, NS_ERROR_HTMLPARSER_EOF if
 *           there is no buffer or the current position is at the end
 */
nsresult nsScanner::GetChar(char16_t& aChar) {
  const bool hasInput =
      mSlidingBuffer && !(mCurrentPosition == mEndPosition);
  if (hasInput) {
    aChar = *mCurrentPosition++;
    return NS_OK;
  }

  aChar = 0;
  return NS_ERROR_HTMLPARSER_EOF;
}

/**
 *  Rebind aSubstring to the range [aStart, aEnd) of the scanner's buffer.
 *
 *  NOTE(review): unlike most methods here, this dereferences
 *  mSlidingBuffer without a null check, so it presumably must only be
 *  called once the scanner holds data - confirm against callers.
 *
 *  @param   aSubstring the substring object to rebind
 *  @param   aStart start of the range
 *  @param   aEnd end of the range
 */
void nsScanner::BindSubstring(nsScannerSubstring& aSubstring,
                              const nsScannerIterator& aStart,
                              const nsScannerIterator& aEnd) {
  auto& source = *mSlidingBuffer;
  aSubstring.Rebind(source, aStart, aEnd);
}

// Copies the scanner's current read position into aPosition.
void nsScanner::CurrentPosition(nsScannerIterator& aPosition) {
  aPosition = mCurrentPosition;
}

// Copies the scanner's end-of-input position into aPosition.
void nsScanner::EndReading(nsScannerIterator& aPosition) {
  aPosition = mEndPosition;
}

/**
 *  Move the current read position to aPosition. Does nothing when there
 *  is no buffer.
 *
 *  @param   aPosition the new current position
 *  @param   aTerminate when true and the new position is the end of the
 *           input, the mark is moved there too and everything before it
 *           is discarded from the buffer
 */
void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) {
  if (!mSlidingBuffer) {
    return;
  }

  mCurrentPosition = aPosition;
  if (!aTerminate) {
    return;
  }

  if (mCurrentPosition == mEndPosition) {
    mMarkPosition = mCurrentPosition;
    mSlidingBuffer->DiscardPrefix(mCurrentPosition);
  }
}

/**
 *  Add an already-filled buffer to the scanner's input, creating the
 *  sliding buffer on first use, and refresh the end position.
 *
 *  @param   aBuf the buffer to add
 *  @return  always true; the bool return is kept so existing callers that
 *           test for failure keep compiling
 */
bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) {
  if (!mSlidingBuffer) {
    // First data: create the sliding buffer around aBuf. A plain |new|
    // expression never yields null, so no null check is needed after it.
    mSlidingBuffer = new nsScannerString(aBuf);
    mSlidingBuffer->BeginReading(mCurrentPosition);
    mMarkPosition = mCurrentPosition;
  } else {
    mSlidingBuffer->AppendBuffer(aBuf);
    // If all earlier input had been consumed, restart reading from the
    // beginning of the buffer.
    // NOTE(review): presumably the consumed prefix has already been
    // discarded, so the beginning is where the new data starts - confirm.
    if (mCurrentPosition == mEndPosition) {
      mSlidingBuffer->BeginReading(mCurrentPosition);
    }
  }

  // Either way the end of the input has moved.
  mSlidingBuffer->EndReading(mEndPosition);

  return true;
}

/**
 *  Call this to copy out of the scanner the data that has not yet been
 *  consumed by the tokenization process, i.e. everything between the
 *  current position and the end of the input.
 *
 *  @update  gess 5/12/98
 *  @param   aCopyBuffer is where the scanner buffer will be copied to; it
 *           is truncated when the scanner has no buffer
 *  @return  true if OK or false on OOM
 */
bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
  if (mSlidingBuffer) {
    // Work on copies so the scanner's own positions are left alone.
    nsScannerIterator unreadBegin;
    nsScannerIterator unreadEnd;
    unreadBegin = mCurrentPosition;
    unreadEnd = mEndPosition;

    return CopyUnicodeTo(unreadBegin, unreadEnd, aCopyBuffer);
  }

  aCopyBuffer.Truncate();
  return true;
}

/**
 *  Retrieve the name of the file that the scanner is reading from.
 *  In some cases, it's just a given name, because the scanner isn't
 *  really reading from a file.
 *
 *  @update  gess 5/12/98
 *  @return  mutable reference to the scanner's stored filename
 */
nsString& nsScanner::GetFilename() {
  return mFilename;
}

/**
 *  Conduct self test. This is intentionally a no-op: selftesting for
 *  this class occurs in the parser selftest.
 *
 *  @update  gess 3/25/98
 */
void nsScanner::SelfTest() {}