gecko-dev/parser/htmlparser/nsScannerString.cpp

649 строки
19 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <stdlib.h>
#include "nsScannerString.h"
/**
* nsScannerBufferList
*/
// Largest character count that AllocBuffer() accepts.  It is derived from
// UINT32_MAX so that the byte size AllocBuffer() computes -- sizeof(Buffer)
// plus (capacity + 1) characters -- still fits in 32 bits.
#define MAX_CAPACITY ((UINT32_MAX / sizeof(char16_t)) - \
(sizeof(Buffer) + sizeof(char16_t)))
/**
 * Allocates a buffer sized for |aString| and copies the string's characters
 * into it.
 *
 * @param aString the characters to copy.
 * @return the new buffer, or nullptr when AllocBuffer() could not provide one.
 */
nsScannerBufferList::Buffer*
nsScannerBufferList::AllocBufferFromString( const nsAString& aString )
{
  const uint32_t charCount = aString.Length();

  Buffer* result = AllocBuffer(charCount);
  if (!result)
    return nullptr;

  nsAString::const_iterator reader;
  aString.BeginReading(reader);
  nsCharTraits<char16_t>::copy(result->DataStart(), reader.get(), charCount);
  return result;
}
/**
 * Allocates a buffer with room for |capacity| characters plus one trailing
 * null character.  The character data itself is left uninitialized.
 *
 * @param capacity number of characters the buffer must hold.
 * @return the new buffer with a usage count of zero, or nullptr if
 *         |capacity| exceeds MAX_CAPACITY or malloc() fails.
 */
nsScannerBufferList::Buffer*
nsScannerBufferList::AllocBuffer( uint32_t capacity )
{
  if (capacity > MAX_CAPACITY)
    return nullptr;

  // A single malloc block holds the Buffer object and capacity + 1 characters.
  const size_t byteCount = sizeof(Buffer) + (capacity + 1) * sizeof(char16_t);
  void* storage = malloc(byteCount);
  if (!storage)
    return nullptr;

  Buffer* result = new (storage) Buffer();
  result->mUsageCount = 0;
  result->mDataEnd = result->DataStart() + capacity;

  // XXX null terminate.  this shouldn't be required, but we do it because
  // nsScanner erroneously thinks it can dereference DataEnd :-(
  *result->mDataEnd = char16_t(0);
  return result;
}
/**
 * Removes every buffer from the list and frees it, regardless of its
 * usage count.
 */
void
nsScannerBufferList::ReleaseAll()
{
  while (!mBuffers.isEmpty())
    free(mBuffers.popFirst());
}
void
nsScannerBufferList::SplitBuffer( const Position& pos )
{
// splitting to the right keeps the work string and any extant token
// pointing to and holding a reference count on the same buffer.
Buffer* bufferToSplit = pos.mBuffer;
NS_ASSERTION(bufferToSplit, "null pointer");
uint32_t splitOffset = pos.mPosition - bufferToSplit->DataStart();
NS_ASSERTION(pos.mPosition >= bufferToSplit->DataStart() &&
splitOffset <= bufferToSplit->DataLength(),
"split offset is outside buffer");
uint32_t len = bufferToSplit->DataLength() - splitOffset;
Buffer* new_buffer = AllocBuffer(len);
if (new_buffer)
{
nsCharTraits<char16_t>::copy(new_buffer->DataStart(),
bufferToSplit->DataStart() + splitOffset,
len);
InsertAfter(new_buffer, bufferToSplit);
bufferToSplit->SetDataLength(splitOffset);
}
}
/**
 * If |aBuf| is the first buffer of the list, frees buffers from the front of
 * the list until the list is empty or a buffer that is still in use is
 * reached.  Does nothing when |aBuf| is not the head.
 */
void
nsScannerBufferList::DiscardUnreferencedPrefix( Buffer* aBuf )
{
  if (aBuf != Head())
    return;

  while (!mBuffers.isEmpty())
  {
    Buffer* first = Head();
    if (first->IsInUse())
      break;

    first->remove();
    free(first);
  }
}
size_t
nsScannerBufferList::Position::Distance( const Position& aStart, const Position& aEnd )
{
size_t result = 0;
if (aStart.mBuffer == aEnd.mBuffer)
{
result = aEnd.mPosition - aStart.mPosition;
}
else
{
result = aStart.mBuffer->DataEnd() - aStart.mPosition;
for (Buffer* b = aStart.mBuffer->Next(); b != aEnd.mBuffer; b = b->Next())
result += b->DataLength();
result += aEnd.mPosition - aEnd.mBuffer->DataStart();
}
return result;
}
/**
* nsScannerSubstring
*/
// Default constructor: creates a substring that refers to no buffer list.
// Both end points are null, the length is zero, and mIsDirty is set so that
// AsString() rebuilds mFlattenedRep on its first call.
nsScannerSubstring::nsScannerSubstring()
: mStart(nullptr, nullptr)
, mEnd(nullptr, nullptr)
, mBufferList(nullptr)
, mLength(0)
, mIsDirty(true)
{
}
// Constructs a substring over a private copy of |s|.
// mBufferList is set to null first because Rebind(const nsAString&) starts
// by calling release_ownership_of_buffer_list().  The remaining range members
// are not initialized here; presumably Rebind() fills them in through
// init_range_from_buffer_list() (defined elsewhere -- TODO confirm).
nsScannerSubstring::nsScannerSubstring( const nsAString& s )
: mBufferList(nullptr)
, mIsDirty(true)
{
Rebind(s);
}
// Destructor: gives up this substring's claim on its buffer list by calling
// release_ownership_of_buffer_list() (defined elsewhere).
nsScannerSubstring::~nsScannerSubstring()
{
release_ownership_of_buffer_list();
}
int32_t
nsScannerSubstring::CountChar( char16_t c ) const
{
/*
re-write this to use a counting sink
*/
size_type result = 0;
size_type lengthToExamine = Length();
nsScannerIterator iter;
for ( BeginReading(iter); ; )
{
int32_t lengthToExamineInThisFragment = iter.size_forward();
const char16_t* fromBegin = iter.get();
result += size_type(NS_COUNT(fromBegin, fromBegin+lengthToExamineInThisFragment, c));
if ( !(lengthToExamine -= lengthToExamineInThisFragment) )
return result;
iter.advance(lengthToExamineInThisFragment);
}
// never reached; quiets warnings
return 0;
}
// Makes this substring refer to the range [aStart, aEnd) of |aString|,
// sharing |aString|'s buffer list.
//
// The length is recomputed with Distance(aStart, aEnd) and mIsDirty is set
// so that AsString() rebuilds its cached value.
void
nsScannerSubstring::Rebind( const nsScannerSubstring& aString,
const nsScannerIterator& aStart,
const nsScannerIterator& aEnd )
{
// allow for the case where &aString == this
// (the new list is acquired before the old one is released; do not reorder)
aString.acquire_ownership_of_buffer_list();
release_ownership_of_buffer_list();
mStart = aStart;
mEnd = aEnd;
mBufferList = aString.mBufferList;
mLength = Distance(aStart, aEnd);
mIsDirty = true;
}
// Makes this substring refer to a private copy of |aString|.
//
// The previous buffer list is released, a new list is created around a
// buffer holding a copy of |aString|, the range is re-initialized from that
// list, and ownership of the new list is acquired.
//
// NOTE(review): AllocBufferFromString() returns nullptr when the allocation
// fails or the string is too long, and that result is passed on unchecked.
// Confirm that the nsScannerBufferList constructor and
// init_range_from_buffer_list() tolerate a null buffer.
void
nsScannerSubstring::Rebind( const nsAString& aString )
{
release_ownership_of_buffer_list();
mBufferList = new nsScannerBufferList(AllocBufferFromString(aString));
mIsDirty = true;
init_range_from_buffer_list();
acquire_ownership_of_buffer_list();
}
/**
 * Returns this substring as a flat string.
 *
 * The result is cached in mFlattenedRep and only rebuilt while mIsDirty is
 * set.  The cache is updated through a const_cast because the method is
 * const.
 */
const nsSubstring&
nsScannerSubstring::AsString() const
{
  if (!mIsDirty)
    return mFlattenedRep;

  nsScannerSubstring* self = const_cast<nsScannerSubstring*>(this);

  if (mStart.mBuffer != mEnd.mBuffer) {
    // The data spans several fragments, so copy it into a flattened buffer.
    nsScannerIterator first, last;
    CopyUnicodeTo(BeginReading(first), EndReading(last), self->mFlattenedRep);
  } else {
    // A single fragment can be exposed directly as a substring.
    self->mFlattenedRep.Rebind(mStart.mPosition, mEnd.mPosition);
  }

  self->mIsDirty = false;
  return mFlattenedRep;
}
/**
 * Positions |iter| at the first character of this substring.
 *
 * The iterator's fragment covers the start buffer from the start position up
 * to the end position when the whole substring lives in one buffer, and up
 * to the end of the start buffer's data otherwise.  normalize_forward() is
 * called on the iterator before it is returned.
 *
 * @return |iter|, to allow chaining.
 */
nsScannerIterator&
nsScannerSubstring::BeginReading( nsScannerIterator& iter ) const
{
  const bool singleBuffer = (mStart.mBuffer == mEnd.mBuffer);

  iter.mOwner = this;
  iter.mFragment.mBuffer = mStart.mBuffer;
  iter.mFragment.mFragmentStart = mStart.mPosition;
  iter.mFragment.mFragmentEnd = singleBuffer ? mEnd.mPosition
                                             : mStart.mBuffer->DataEnd();
  iter.mPosition = mStart.mPosition;

  iter.normalize_forward();
  return iter;
}
/**
 * Positions |iter| one past the last character of this substring.
 *
 * The iterator's fragment covers the end buffer up to the end position,
 * beginning at the start position when the whole substring lives in one
 * buffer and at the start of the end buffer's data otherwise.
 *
 * @return |iter|, to allow chaining.
 */
nsScannerIterator&
nsScannerSubstring::EndReading( nsScannerIterator& iter ) const
{
  const bool singleBuffer = (mStart.mBuffer == mEnd.mBuffer);

  iter.mOwner = this;
  iter.mFragment.mBuffer = mEnd.mBuffer;
  iter.mFragment.mFragmentEnd = mEnd.mPosition;
  iter.mFragment.mFragmentStart = singleBuffer ? mStart.mPosition
                                               : mEnd.mBuffer->DataStart();
  iter.mPosition = mEnd.mPosition;

  // must not |normalize_backward| as that would likely invalidate tests
  // like |while ( first != last )|
  return iter;
}
/**
 * Moves |frag| to the following buffer of this substring.
 *
 * The fragment bounds are clipped to this substring's start and end
 * positions when the new buffer is the start or end buffer.
 *
 * @return false (leaving |frag| untouched) if |frag| is already on the end
 *         buffer, true otherwise.
 */
bool
nsScannerSubstring::GetNextFragment( nsScannerFragment& frag ) const
{
  // Already on the last buffer of the substring: nothing follows.
  if (frag.mBuffer == mEnd.mBuffer)
    return false;

  frag.mBuffer = frag.mBuffer->getNext();

  frag.mFragmentStart = (frag.mBuffer == mStart.mBuffer)
                      ? mStart.mPosition
                      : frag.mBuffer->DataStart();

  frag.mFragmentEnd = (frag.mBuffer == mEnd.mBuffer)
                    ? mEnd.mPosition
                    : frag.mBuffer->DataEnd();

  return true;
}
/**
 * Moves |frag| to the preceding buffer of this substring.
 *
 * The fragment bounds are clipped to this substring's start and end
 * positions when the new buffer is the start or end buffer.
 *
 * @return false (leaving |frag| untouched) if |frag| is already on the start
 *         buffer, true otherwise.
 */
bool
nsScannerSubstring::GetPrevFragment( nsScannerFragment& frag ) const
{
  // Already on the first buffer of the substring: nothing precedes it.
  if (frag.mBuffer == mStart.mBuffer)
    return false;

  frag.mBuffer = frag.mBuffer->getPrevious();

  frag.mFragmentStart = (frag.mBuffer == mStart.mBuffer)
                      ? mStart.mPosition
                      : frag.mBuffer->DataStart();

  frag.mFragmentEnd = (frag.mBuffer == mEnd.mBuffer)
                    ? mEnd.mPosition
                    : frag.mBuffer->DataEnd();

  return true;
}
/**
* nsScannerString
*/
// Constructs a scanner string whose buffer list initially holds |aBuf|.
// The range is then set up from the new list and ownership of the list is
// acquired (both helpers are defined elsewhere).
nsScannerString::nsScannerString( Buffer* aBuf )
{
mBufferList = new nsScannerBufferList(aBuf);
init_range_from_buffer_list();
acquire_ownership_of_buffer_list();
}
// Appends |aBuf| to the buffer list and extends this string so that it ends
// at the end of |aBuf|'s data.  |aBuf| must not be null: it is dereferenced
// unconditionally.
void
nsScannerString::AppendBuffer( Buffer* aBuf )
{
mBufferList->Append(aBuf);
mLength += aBuf->DataLength();
// the string now ends at the end of the appended buffer
mEnd.mBuffer = aBuf;
mEnd.mPosition = aBuf->DataEnd();
// the cached result of AsString() is stale now
mIsDirty = true;
}
// Drops everything before |aIter| from this string.
//
// The start position moves to |aIter| and the length shrinks by the distance
// between the old and the new start.  The usage count then moves from the
// old start buffer to the new one, and the buffer list is asked to free any
// leading buffers that are no longer in use.
void
nsScannerString::DiscardPrefix( const nsScannerIterator& aIter )
{
Position old_start(mStart);
mStart = aIter;
mLength -= Position::Distance(old_start, mStart);
// increment before decrementing, so that a buffer which is both the old and
// the new start never sees its count drop in between
mStart.mBuffer->IncrementUsageCount();
old_start.mBuffer->DecrementUsageCount();
mBufferList->DiscardUnreferencedPrefix(old_start.mBuffer);
mIsDirty = true;
}
/**
 * Inserts a copy of |aReadable| into this string at |aInsertPoint|.
 *
 * Warning: this routine manipulates the shared buffer list in an unexpected
 * way.  The original design did not really allow for insertions, but this
 * call promises that if called for a point after the end of all extant token
 * strings, that no token string or the work string will be invalidated.
 *
 * This routine is protected because it is the responsibility of the derived
 * class to keep those promises.
 *
 * If the buffer for |aReadable| cannot be allocated, nothing is inserted and
 * the length is left unchanged; the end position is still refreshed because
 * the buffer at the insertion point may already have been split.
 *
 * @param aReadable    the characters to insert.
 * @param aInsertPoint where to insert them.
 */
void
nsScannerString::UngetReadable( const nsAString& aReadable, const nsScannerIterator& aInsertPoint )
{
  Position insertPos(aInsertPoint);

  // splitting to the right keeps the work string and any extant token
  // pointing to and holding a reference count on the same buffer
  mBufferList->SplitBuffer(insertPos);

  // make a new buffer with all the data to insert...
  // NOTE: we may have empty space to re-use in the split buffer, measure the
  // cost of this and decide if we should do the work to fill it
  Buffer* new_buffer = AllocBufferFromString(aReadable);

  // AllocBufferFromString() returns nullptr on allocation failure or when
  // the string is too long.  Never link a null node into the list, and never
  // count characters that were not actually inserted.
  if (new_buffer)
  {
    Buffer* buffer_to_split = insertPos.mBuffer;
    mBufferList->InsertAfter(new_buffer, buffer_to_split);
    mLength += aReadable.Length();
  }

  // SplitBuffer() may have shortened the old tail buffer and added a new one
  // behind it, so re-derive the end position in every case.
  mEnd.mBuffer = mBufferList->Tail();
  mEnd.mPosition = mEnd.mBuffer->DataEnd();
  mIsDirty = true;
}
/**
 * Overwrites the character at |aPosition| with |aChar| and marks the cached
 * flattened representation as stale.
 */
void
nsScannerString::ReplaceCharacter(nsScannerIterator& aPosition, char16_t aChar)
{
  // XXX The iterator only hands out a const pointer, so constness is cast
  // away here.  Unless the base class provides support for writing
  // iterators, this is the best that can be done.
  *const_cast<char16_t*>(aPosition.get()) = aChar;
  mIsDirty = true;
}
/**
* nsScannerSharedSubstring
*/
/**
 * Makes this shared substring represent the range [aStart, aEnd).
 *
 * If both iterators are inside the same buffer, the string simply points
 * into that buffer, and a reference on the buffer list plus a usage count on
 * the buffer are held for as long as it does.  Otherwise the characters are
 * copied and no buffer is held.
 */
void
nsScannerSharedSubstring::Rebind(const nsScannerIterator &aStart,
                                 const nsScannerIterator &aEnd)
{
  Buffer *buffer = const_cast<Buffer*>(aStart.buffer());
  const bool sameBuffer = (buffer == aEnd.buffer());

  // Take the new references before dropping the old ones.
  nsScannerBufferList *bufferList = nullptr;
  if (sameBuffer) {
    bufferList = aStart.mOwner->mBufferList;
    bufferList->AddRef();
    buffer->IncrementUsageCount();
  }

  if (mBufferList)
    ReleaseBuffer();

  if (!sameBuffer) {
    // The range spans buffers: keep a private copy and hold on to nothing.
    mBuffer = nullptr;
    mBufferList = nullptr;
    CopyUnicodeTo(aStart, aEnd, mString);
    return;
  }

  mBuffer = buffer;
  mBufferList = bufferList;
  mString.Rebind(aStart.mPosition, aEnd.mPosition);
}
// Drops the usage count held on mBuffer and the reference held on
// mBufferList, letting the list free leading buffers that are no longer in
// use.  Must only be called while mBufferList is non-null; mBuffer is
// dereferenced unconditionally.  The members themselves are not cleared
// here -- callers reset or overwrite them.
void
nsScannerSharedSubstring::ReleaseBuffer()
{
NS_ASSERTION(mBufferList, "Should only be called with non-null mBufferList");
mBuffer->DecrementUsageCount();
mBufferList->DiscardUnreferencedPrefix(mBuffer);
mBufferList->Release();
}
// Detaches mString from the scanner buffer: the characters are copied into
// storage owned by the string, and the buffer and buffer list are then let
// go.
//
// NOTE(review): ReleaseBuffer() requires a non-null mBufferList, so this
// presumably must only be called while a buffer is held -- confirm that all
// callers check.
void
nsScannerSharedSubstring::MakeMutable()
{
nsString temp(mString); // this will force a copy of the data
mString.Assign(temp); // mString will now share the just-allocated buffer
// the copy must be in place before the scanner buffer is released
ReleaseBuffer();
mBuffer = nullptr;
mBufferList = nullptr;
}
/**
* utils -- based on code from nsReadableUtils.cpp
*/
// private helper function
/**
 * Copies the characters in [first, last) to |result|, one readable run at a
 * time.  |first| is advanced until it equals |last|.
 *
 * @return |result|.
 */
static inline
nsAString::iterator&
copy_multifragment_string( nsScannerIterator& first, const nsScannerIterator& last, nsAString::iterator& result )
{
  using Source = nsCharSourceTraits<nsScannerIterator>;
  using Sink = nsCharSinkTraits<nsAString::iterator>;

  while ( first != last )
  {
    // Number of characters that can be read from |first| in one go.
    const uint32_t distance = Source::readable_distance(first, last);
    Sink::write(result, Source::read(first), distance);
    NS_ASSERTION(distance > 0, "|copy_multifragment_string| will never terminate");
    Source::advance(first, distance);
  }

  return result;
}
void
CopyUnicodeTo( const nsScannerIterator& aSrcStart,
const nsScannerIterator& aSrcEnd,
nsAString& aDest )
{
nsAString::iterator writer;
if (!aDest.SetLength(Distance(aSrcStart, aSrcEnd), mozilla::fallible)) {
aDest.Truncate();
return; // out of memory
}
aDest.BeginWriting(writer);
nsScannerIterator fromBegin(aSrcStart);
copy_multifragment_string(fromBegin, aSrcEnd, writer);
}
/**
 * Appends the characters in [aSrcStart, aSrcEnd) to the shared substring
 * |aDest|.
 *
 * An empty destination is rebound to the source range instead of being
 * appended to.
 */
void
AppendUnicodeTo( const nsScannerIterator& aSrcStart,
                 const nsScannerIterator& aSrcEnd,
                 nsScannerSharedSubstring& aDest )
{
  if (!aDest.str().IsEmpty()) {
    // The dest string is not empty, so it can't be a dependent substring.
    AppendUnicodeTo(aSrcStart, aSrcEnd, aDest.writable());
    return;
  }

  // We can just make |aDest| point to the buffer.
  // This will take care of copying if the buffer spans fragments.
  aDest.Rebind(aSrcStart, aSrcEnd);
}
void
AppendUnicodeTo( const nsScannerIterator& aSrcStart,
const nsScannerIterator& aSrcEnd,
nsAString& aDest )
{
nsAString::iterator writer;
uint32_t oldLength = aDest.Length();
if (!aDest.SetLength(oldLength + Distance(aSrcStart, aSrcEnd), mozilla::fallible))
return; // out of memory
aDest.BeginWriting(writer).advance(oldLength);
nsScannerIterator fromBegin(aSrcStart);
copy_multifragment_string(fromBegin, aSrcEnd, writer);
}
/**
 * Searches [aSearchStart, aSearchEnd) for |aChar|, one fragment at a time.
 *
 * @return true with |aSearchStart| positioned on the first occurrence, or
 *         false with |aSearchStart| advanced to |aSearchEnd|.
 */
bool
FindCharInReadable( char16_t aChar,
                    nsScannerIterator& aSearchStart,
                    const nsScannerIterator& aSearchEnd )
{
  while ( aSearchStart != aSearchEnd )
  {
    const char16_t* fragmentStart = aSearchStart.get();

    // In the last fragment only look as far as |aSearchEnd|.
    int32_t fragmentLength;
    if ( SameFragment(aSearchStart, aSearchEnd) )
      fragmentLength = aSearchEnd.get() - fragmentStart;
    else
      fragmentLength = aSearchStart.size_forward();

    const char16_t* hit =
      nsCharTraits<char16_t>::find(fragmentStart, fragmentLength, aChar);
    if ( !hit ) {
      aSearchStart.advance(fragmentLength);
      continue;
    }

    aSearchStart.advance( hit - fragmentStart );
    return true;
  }

  return false;
}
/**
 * Searches [aSearchStart, aSearchEnd) for the first occurrence of |aPattern|.
 *
 * Characters are compared one at a time with |compare|; a zero result is
 * treated as a match.
 *
 * An empty pattern is never found.  Without an explicit check the loops
 * below would read the character at the pattern's start and step past the
 * pattern's end.
 *
 * @param aPattern     the text to look for.
 * @param aSearchStart in: start of the range to search.  out: start of the
 *                     match; equal to |aSearchEnd| if a non-empty range was
 *                     searched without success.
 * @param aSearchEnd   in: end of the range to search.  out: end of the match
 *                     if one was found, unchanged otherwise.
 * @param compare      the comparator used for the character comparisons.
 * @return true if the pattern was found.
 */
bool
FindInReadable( const nsAString& aPattern,
                nsScannerIterator& aSearchStart,
                nsScannerIterator& aSearchEnd,
                const nsStringComparator& compare )
{
  bool found_it = false;

  // only bother searching at all if we're given a non-empty range to search
  if ( aSearchStart != aSearchEnd )
  {
    nsAString::const_iterator aPatternStart, aPatternEnd;
    aPattern.BeginReading(aPatternStart);
    aPattern.EndReading(aPatternEnd);

    // An empty pattern has no first character to compare.  Report "not
    // found" and exhaust the range, the same way an unsuccessful search
    // does, so that callers looping on the range still terminate.
    if ( aPatternStart == aPatternEnd )
    {
      aSearchStart = aSearchEnd;
      return false;
    }

    // outer loop keeps searching till we find it or run out of string to search
    while ( !found_it )
    {
      // fast inner loop (that's what it's called, not what it is) looks for a potential match
      while ( aSearchStart != aSearchEnd &&
              compare(aPatternStart.get(), aSearchStart.get(), 1, 1) )
        ++aSearchStart;

      // if we broke out of the `fast' loop because we're out of string ... we're done: no match
      if ( aSearchStart == aSearchEnd )
        break;

      // otherwise, we're at a potential match, let's see if we really hit one
      nsAString::const_iterator testPattern(aPatternStart);
      nsScannerIterator testSearch(aSearchStart);

      // slow inner loop verifies the potential match (found by the `fast' loop) at the current position
      for(;;)
      {
        // we already compared the first character in the outer loop,
        // so we'll advance before the next comparison
        ++testPattern;
        ++testSearch;

        // if we verified all the way to the end of the pattern, then we found it!
        if ( testPattern == aPatternEnd )
        {
          found_it = true;
          aSearchEnd = testSearch; // return the exact found range through the parameters
          break;
        }

        // if we got to end of the string we're searching before we hit the end of the
        // pattern, we'll never find what we're looking for
        if ( testSearch == aSearchEnd )
        {
          aSearchStart = aSearchEnd;
          break;
        }

        // else if we mismatched ... it's time to advance to the next search position
        // and get back into the `fast' loop
        if ( compare(testPattern.get(), testSearch.get(), 1, 1) )
        {
          ++aSearchStart;
          break;
        }
      }
    }
  }

  return found_it;
}
/**
* This implementation is simple, but does too much work.
* It searches the entire string from left to right, and returns the last match found, if any.
* This implementation will be replaced when I get |reverse_iterator|s working.
*/
bool
RFindInReadable( const nsAString& aPattern,
nsScannerIterator& aSearchStart,
nsScannerIterator& aSearchEnd,
const nsStringComparator& aComparator )
{
bool found_it = false;
nsScannerIterator savedSearchEnd(aSearchEnd);
nsScannerIterator searchStart(aSearchStart), searchEnd(aSearchEnd);
while ( searchStart != searchEnd )
{
if ( FindInReadable(aPattern, searchStart, searchEnd, aComparator) )
{
found_it = true;
// this is the best match so far, so remember it
aSearchStart = searchStart;
aSearchEnd = searchEnd;
// ...and get ready to search some more
// (it's tempting to set |searchStart=searchEnd| ... but that misses overlapping patterns)
++searchStart;
searchEnd = savedSearchEnd;
}
}
// if we never found it, return an empty range
if ( !found_it )
aSearchStart = aSearchEnd;
return found_it;
}