gecko-dev/parser/htmlparser/nsScannerString.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <stdlib.h>
#include "nsScannerString.h"


  /**
   * nsScannerBufferList
   */

#define MAX_CAPACITY ((UINT32_MAX / sizeof(char16_t)) - \
                      (sizeof(Buffer) + sizeof(char16_t)))

nsScannerBufferList::Buffer*
nsScannerBufferList::AllocBufferFromString( const nsAString& aString )
  {
    uint32_t len = aString.Length();
    Buffer* buf = AllocBuffer(len);

    if (buf)
      {
        nsAString::const_iterator source;
        aString.BeginReading(source);
        nsCharTraits<char16_t>::copy(buf->DataStart(), source.get(), len);
      }
    return buf;
  }

nsScannerBufferList::Buffer*
nsScannerBufferList::AllocBuffer( uint32_t capacity )
  {
    if (capacity > MAX_CAPACITY)
      return nullptr;

    void* ptr = malloc(sizeof(Buffer) + (capacity + 1) * sizeof(char16_t));
    if (!ptr)
      return nullptr;

    Buffer* buf = new (ptr) Buffer();

    buf->mUsageCount = 0;
    buf->mDataEnd = buf->DataStart() + capacity;

    // XXX null terminate.  this shouldn't be required, but we do it because
    // nsScanner erroneously thinks it can dereference DataEnd :-(
    *buf->mDataEnd = char16_t(0);
    return buf;
  }

void
nsScannerBufferList::ReleaseAll()
  {
    while (!mBuffers.isEmpty())
      {
        Buffer* node = mBuffers.popFirst();
        //printf(">>> freeing buffer @%p\n", node);
        free(node);
      }
  }

void
nsScannerBufferList::SplitBuffer( const Position& pos )
  {
    // splitting to the right keeps the work string and any extant token
    // pointing to and holding a reference count on the same buffer.

    Buffer* bufferToSplit = pos.mBuffer;
    NS_ASSERTION(bufferToSplit, "null pointer");

    uint32_t splitOffset = pos.mPosition - bufferToSplit->DataStart();
    NS_ASSERTION(pos.mPosition >= bufferToSplit->DataStart() &&
                 splitOffset <= bufferToSplit->DataLength(),
                 "split offset is outside buffer");
    
    uint32_t len = bufferToSplit->DataLength() - splitOffset;
    Buffer* new_buffer = AllocBuffer(len);
    if (new_buffer)
      {
        nsCharTraits<char16_t>::copy(new_buffer->DataStart(),
                                      bufferToSplit->DataStart() + splitOffset,
                                      len);
        InsertAfter(new_buffer, bufferToSplit);
        bufferToSplit->SetDataLength(splitOffset);
      }
  }

void
nsScannerBufferList::DiscardUnreferencedPrefix( Buffer* aBuf )
  {
    if (aBuf == Head())
      {
        while (!mBuffers.isEmpty() && !Head()->IsInUse())
          {
            Buffer* buffer = Head();
            buffer->remove();
            free(buffer);
          }
      }
  }

size_t
nsScannerBufferList::Position::Distance( const Position& aStart, const Position& aEnd )
  {
    size_t result = 0;
    if (aStart.mBuffer == aEnd.mBuffer)
      {
        result = aEnd.mPosition - aStart.mPosition;
      }
    else
      {
        result = aStart.mBuffer->DataEnd() - aStart.mPosition;
        for (Buffer* b = aStart.mBuffer->Next(); b != aEnd.mBuffer; b = b->Next())
          result += b->DataLength();
        result += aEnd.mPosition - aEnd.mBuffer->DataStart();
      }
    return result;
  }


/**
 * nsScannerSubstring
 */

nsScannerSubstring::nsScannerSubstring()
  : mStart(nullptr, nullptr)
  , mEnd(nullptr, nullptr)
  , mBufferList(nullptr)
  , mLength(0)
  , mIsDirty(true)
  {
  }

nsScannerSubstring::nsScannerSubstring( const nsAString& s )
  : mBufferList(nullptr)
  , mIsDirty(true)
  {
    Rebind(s);
  }

nsScannerSubstring::~nsScannerSubstring()
  {
    release_ownership_of_buffer_list();
  }

int32_t
nsScannerSubstring::CountChar( char16_t c ) const
  {
      /*
        re-write this to use a counting sink
       */

    size_type result = 0;
    size_type lengthToExamine = Length();

    nsScannerIterator iter;
    for ( BeginReading(iter); ; )
      {
        int32_t lengthToExamineInThisFragment = iter.size_forward();
        const char16_t* fromBegin = iter.get();
        result += size_type(NS_COUNT(fromBegin, fromBegin+lengthToExamineInThisFragment, c));
        if ( !(lengthToExamine -= lengthToExamineInThisFragment) )
          return result;
        iter.advance(lengthToExamineInThisFragment);
      }
      // never reached; quiets warnings
    return 0;
  }

void
nsScannerSubstring::Rebind( const nsScannerSubstring& aString,
                            const nsScannerIterator& aStart, 
                            const nsScannerIterator& aEnd )
  {
    // allow for the case where &aString == this

    aString.acquire_ownership_of_buffer_list();
    release_ownership_of_buffer_list();

    mStart      = aStart;
    mEnd        = aEnd;
    mBufferList = aString.mBufferList;
    mLength     = Distance(aStart, aEnd);
    mIsDirty    = true;
  }

void
nsScannerSubstring::Rebind( const nsAString& aString )
  {
    release_ownership_of_buffer_list();

    mBufferList = new nsScannerBufferList(AllocBufferFromString(aString));
    mIsDirty    = true;

    init_range_from_buffer_list();
    acquire_ownership_of_buffer_list();
  }

const nsSubstring&
nsScannerSubstring::AsString() const
  {
    if (mIsDirty)
      {
        nsScannerSubstring* mutable_this = const_cast<nsScannerSubstring*>(this);

        if (mStart.mBuffer == mEnd.mBuffer) {
          // We only have a single fragment to deal with, so just return it
          // as a substring.
          mutable_this->mFlattenedRep.Rebind(mStart.mPosition, mEnd.mPosition);
        } else {
          // Otherwise, we need to copy the data into a flattened buffer.
          nsScannerIterator start, end;
          CopyUnicodeTo(BeginReading(start), EndReading(end), mutable_this->mFlattenedRep);
        }

        mutable_this->mIsDirty = false;
      }

    return mFlattenedRep;
  }

nsScannerIterator&
nsScannerSubstring::BeginReading( nsScannerIterator& iter ) const
  {
    iter.mOwner = this;

    iter.mFragment.mBuffer = mStart.mBuffer;
    iter.mFragment.mFragmentStart = mStart.mPosition;
    if (mStart.mBuffer == mEnd.mBuffer)
      iter.mFragment.mFragmentEnd = mEnd.mPosition;
    else
      iter.mFragment.mFragmentEnd = mStart.mBuffer->DataEnd();

    iter.mPosition = mStart.mPosition;
    iter.normalize_forward();
    return iter;
  }

nsScannerIterator&
nsScannerSubstring::EndReading( nsScannerIterator& iter ) const
  {
    iter.mOwner = this;

    iter.mFragment.mBuffer = mEnd.mBuffer;
    iter.mFragment.mFragmentEnd = mEnd.mPosition;
    if (mStart.mBuffer == mEnd.mBuffer)
      iter.mFragment.mFragmentStart = mStart.mPosition;
    else
      iter.mFragment.mFragmentStart = mEnd.mBuffer->DataStart();

    iter.mPosition = mEnd.mPosition;
    // must not |normalize_backward| as that would likely invalidate tests like |while ( first != last )|
    return iter;
  }

bool
nsScannerSubstring::GetNextFragment( nsScannerFragment& frag ) const
  {
    // check to see if we are at the end of the buffer list
    if (frag.mBuffer == mEnd.mBuffer)
      return false;

    frag.mBuffer = frag.mBuffer->getNext();

    if (frag.mBuffer == mStart.mBuffer)
      frag.mFragmentStart = mStart.mPosition;
    else
      frag.mFragmentStart = frag.mBuffer->DataStart();

    if (frag.mBuffer == mEnd.mBuffer)
      frag.mFragmentEnd = mEnd.mPosition;
    else
      frag.mFragmentEnd = frag.mBuffer->DataEnd();

    return true;
  }

bool
nsScannerSubstring::GetPrevFragment( nsScannerFragment& frag ) const
  {
    // check to see if we are at the beginning of the buffer list
    if (frag.mBuffer == mStart.mBuffer)
      return false;

    frag.mBuffer = frag.mBuffer->getPrevious();

    if (frag.mBuffer == mStart.mBuffer)
      frag.mFragmentStart = mStart.mPosition;
    else
      frag.mFragmentStart = frag.mBuffer->DataStart();

    if (frag.mBuffer == mEnd.mBuffer)
      frag.mFragmentEnd = mEnd.mPosition;
    else
      frag.mFragmentEnd = frag.mBuffer->DataEnd();

    return true;
  }


  /**
   * nsScannerString
   */

nsScannerString::nsScannerString( Buffer* aBuf )
  {
    mBufferList = new nsScannerBufferList(aBuf);

    init_range_from_buffer_list();
    acquire_ownership_of_buffer_list();
  }

void
nsScannerString::AppendBuffer( Buffer* aBuf )
  {
    mBufferList->Append(aBuf);
    mLength += aBuf->DataLength();

    mEnd.mBuffer = aBuf;
    mEnd.mPosition = aBuf->DataEnd();

    mIsDirty = true;
  }

void
nsScannerString::DiscardPrefix( const nsScannerIterator& aIter )
  {
    Position old_start(mStart);
    mStart = aIter;
    mLength -= Position::Distance(old_start, mStart);
    
    mStart.mBuffer->IncrementUsageCount();
    old_start.mBuffer->DecrementUsageCount();

    mBufferList->DiscardUnreferencedPrefix(old_start.mBuffer);

    mIsDirty = true;
  }

void
nsScannerString::UngetReadable( const nsAString& aReadable, const nsScannerIterator& aInsertPoint )
    /*
     * Warning: this routine manipulates the shared buffer list in an unexpected way.
     *  The original design did not really allow for insertions, but this call promises
     *  that if called for a point after the end of all extant token strings, that no token string
     *  or the work string will be invalidated.
     *
     *  This routine is protected because it is the responsibility of the derived class to keep those promises.
     */
  {
    Position insertPos(aInsertPoint);

    mBufferList->SplitBuffer(insertPos);
      // splitting to the right keeps the work string and any extant token pointing to and
      //  holding a reference count on the same buffer

    Buffer* new_buffer = AllocBufferFromString(aReadable);
      // make a new buffer with all the data to insert...
      //  BULLSHIT ALERT: we may have empty space to re-use in the split buffer, measure the cost
      //  of this and decide if we should do the work to fill it

    Buffer* buffer_to_split = insertPos.mBuffer;
    mBufferList->InsertAfter(new_buffer, buffer_to_split);
    mLength += aReadable.Length();

    mEnd.mBuffer = mBufferList->Tail();
    mEnd.mPosition = mEnd.mBuffer->DataEnd();

    mIsDirty = true;
  }

void
nsScannerString::ReplaceCharacter(nsScannerIterator& aPosition, char16_t aChar)
  {
    // XXX Casting a const to non-const. Unless the base class
    // provides support for writing iterators, this is the best
    // that can be done.
    char16_t* pos = const_cast<char16_t*>(aPosition.get());
    *pos = aChar;

    mIsDirty = true;
  }


  /**
   * nsScannerSharedSubstring
   */

void
nsScannerSharedSubstring::Rebind(const nsScannerIterator &aStart,
                              const nsScannerIterator &aEnd)
{
  // If the start and end positions are inside the same buffer, we must
  // acquire ownership of the buffer.  If not, we can optimize by not holding
  // onto it.

  Buffer *buffer = const_cast<Buffer*>(aStart.buffer());
  bool sameBuffer = buffer == aEnd.buffer();

  nsScannerBufferList *bufferList;

  if (sameBuffer) {
    bufferList = aStart.mOwner->mBufferList;
    bufferList->AddRef();
    buffer->IncrementUsageCount();
  }

  if (mBufferList)
    ReleaseBuffer();

  if (sameBuffer) {
    mBuffer = buffer;
    mBufferList = bufferList;
    mString.Rebind(aStart.mPosition, aEnd.mPosition);
  } else {
    mBuffer = nullptr;
    mBufferList = nullptr;
    CopyUnicodeTo(aStart, aEnd, mString);
  }
}

void
nsScannerSharedSubstring::ReleaseBuffer()
{
  NS_ASSERTION(mBufferList, "Should only be called with non-null mBufferList");
  mBuffer->DecrementUsageCount();
  mBufferList->DiscardUnreferencedPrefix(mBuffer);
  mBufferList->Release();
}

void
nsScannerSharedSubstring::MakeMutable()
{
  nsString temp(mString); // this will force a copy of the data
  mString.Assign(temp);   // mString will now share the just-allocated buffer

  ReleaseBuffer();

  mBuffer = nullptr;
  mBufferList = nullptr;
}

  /**
   * utils -- based on code from nsReadableUtils.cpp
   */

// private helper function
static inline
nsAString::iterator&
copy_multifragment_string( nsScannerIterator& first, const nsScannerIterator& last, nsAString::iterator& result )
  {
    typedef nsCharSourceTraits<nsScannerIterator> source_traits;
    typedef nsCharSinkTraits<nsAString::iterator> sink_traits;

    while ( first != last )
      {
        uint32_t distance = source_traits::readable_distance(first, last);
        sink_traits::write(result, source_traits::read(first), distance);
        NS_ASSERTION(distance > 0, "|copy_multifragment_string| will never terminate");
        source_traits::advance(first, distance);
      }

    return result;
  }

void
CopyUnicodeTo( const nsScannerIterator& aSrcStart,
               const nsScannerIterator& aSrcEnd,
               nsAString& aDest )
  {
    nsAString::iterator writer;
    if (!aDest.SetLength(Distance(aSrcStart, aSrcEnd), mozilla::fallible)) {
      aDest.Truncate();
      return; // out of memory
    }
    aDest.BeginWriting(writer);
    nsScannerIterator fromBegin(aSrcStart);
    
    copy_multifragment_string(fromBegin, aSrcEnd, writer);
  }

void
AppendUnicodeTo( const nsScannerIterator& aSrcStart,
                 const nsScannerIterator& aSrcEnd,
                 nsScannerSharedSubstring& aDest )
  {
    // Check whether we can just create a dependent string.
    if (aDest.str().IsEmpty()) {
      // We can just make |aDest| point to the buffer.
      // This will take care of copying if the buffer spans fragments.
      aDest.Rebind(aSrcStart, aSrcEnd);
    } else {
      // The dest string is not empty, so it can't be a dependent substring.
      AppendUnicodeTo(aSrcStart, aSrcEnd, aDest.writable());
    }
  }

void
AppendUnicodeTo( const nsScannerIterator& aSrcStart,
                 const nsScannerIterator& aSrcEnd,
                 nsAString& aDest )
  {
    nsAString::iterator writer;
    uint32_t oldLength = aDest.Length();
    if (!aDest.SetLength(oldLength + Distance(aSrcStart, aSrcEnd), mozilla::fallible))
      return; // out of memory
    aDest.BeginWriting(writer).advance(oldLength);
    nsScannerIterator fromBegin(aSrcStart);
    
    copy_multifragment_string(fromBegin, aSrcEnd, writer);
  }

bool
FindCharInReadable( char16_t aChar,
                    nsScannerIterator& aSearchStart,
                    const nsScannerIterator& aSearchEnd )
  {
    while ( aSearchStart != aSearchEnd )
      {
        int32_t fragmentLength;
        if ( SameFragment(aSearchStart, aSearchEnd) ) 
          fragmentLength = aSearchEnd.get() - aSearchStart.get();
        else
          fragmentLength = aSearchStart.size_forward();

        const char16_t* charFoundAt = nsCharTraits<char16_t>::find(aSearchStart.get(), fragmentLength, aChar);
        if ( charFoundAt ) {
          aSearchStart.advance( charFoundAt - aSearchStart.get() );
          return true;
        }

        aSearchStart.advance(fragmentLength);
      }

    return false;
  }

bool
FindInReadable( const nsAString& aPattern,
                nsScannerIterator& aSearchStart,
                nsScannerIterator& aSearchEnd,
                const nsStringComparator& compare )
  {
    bool found_it = false;

      // only bother searching at all if we're given a non-empty range to search
    if ( aSearchStart != aSearchEnd )
      {
        nsAString::const_iterator aPatternStart, aPatternEnd;
        aPattern.BeginReading(aPatternStart);
        aPattern.EndReading(aPatternEnd);

          // outer loop keeps searching till we find it or run out of string to search
        while ( !found_it )
          {
              // fast inner loop (that's what it's called, not what it is) looks for a potential match
            while ( aSearchStart != aSearchEnd &&
                    compare(aPatternStart.get(), aSearchStart.get(), 1, 1) )
              ++aSearchStart;

              // if we broke out of the `fast' loop because we're out of string ... we're done: no match
            if ( aSearchStart == aSearchEnd )
              break;

              // otherwise, we're at a potential match, let's see if we really hit one
            nsAString::const_iterator testPattern(aPatternStart);
            nsScannerIterator testSearch(aSearchStart);

              // slow inner loop verifies the potential match (found by the `fast' loop) at the current position
            for(;;)
              {
                  // we already compared the first character in the outer loop,
                  //  so we'll advance before the next comparison
                ++testPattern;
                ++testSearch;

                  // if we verified all the way to the end of the pattern, then we found it!
                if ( testPattern == aPatternEnd )
                  {
                    found_it = true;
                    aSearchEnd = testSearch; // return the exact found range through the parameters
                    break;
                  }

                  // if we got to end of the string we're searching before we hit the end of the
                  //  pattern, we'll never find what we're looking for
                if ( testSearch == aSearchEnd )
                  {
                    aSearchStart = aSearchEnd;
                    break;
                  }

                  // else if we mismatched ... it's time to advance to the next search position
                  //  and get back into the `fast' loop
                if ( compare(testPattern.get(), testSearch.get(), 1, 1) )
                  {
                    ++aSearchStart;
                    break;
                  }
              }
          }
      }

    return found_it;
  }

  /**
   * This implementation is simple, but does too much work.
   * It searches the entire string from left to right, and returns the last match found, if any.
   * This implementation will be replaced when I get |reverse_iterator|s working.
   */
bool
RFindInReadable( const nsAString& aPattern,
                 nsScannerIterator& aSearchStart,
                 nsScannerIterator& aSearchEnd,
                 const nsStringComparator& aComparator )
  {
    bool found_it = false;

    nsScannerIterator savedSearchEnd(aSearchEnd);
    nsScannerIterator searchStart(aSearchStart), searchEnd(aSearchEnd);

    while ( searchStart != searchEnd )
      {
        if ( FindInReadable(aPattern, searchStart, searchEnd, aComparator) )
          {
            found_it = true;

              // this is the best match so far, so remember it
            aSearchStart = searchStart;
            aSearchEnd = searchEnd;

              // ...and get ready to search some more
              //  (it's tempting to set |searchStart=searchEnd| ... but that misses overlapping patterns)
            ++searchStart;
            searchEnd = savedSearchEnd;
          }
      }

      // if we never found it, return an empty range
    if ( !found_it )
      aSearchStart = aSearchEnd;

    return found_it;
  }