/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
//#define __INCREMENTAL 1
#define NS_IMPL_IDS
#include "nsScanner.h"
#include "nsDebug.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
#include "nsICharsetAlias.h"
#include "nsFileSpec.h"
static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
const char* kBadHTMLText="
Oops...
You just tried to read a non-existent document:
";
// Message handed to NS_ASSERTION by the ReadWhile overloads in this file when
// a caller claims its character set is ordered but IsOrdered() disagrees.
const char* kUnorderedStringError = "String argument must be ordered. Don't you read API's?";
// Size passed to the input stream's read() call in this file.
// Defining __INCREMENTAL (see the commented-out #define near the top of the
// file) shrinks it to 1.
#ifdef __INCREMENTAL
const int kBufsize=1;
#else
const int kBufsize=64;
#endif
// Declares the counter used by MOZ_COUNT_CTOR / MOZ_COUNT_DTOR in the
// nsScanner constructors and destructor.
MOZ_DECL_CTOR_COUNTER(nsScanner);
/**
 *  Builds a scanner whose input is the single string handed in here;
 *  no input stream is attached. (Historically added as a short cut
 *  for Javascript.)
 *
 *  @update  gess 5/12/98
 *  @param   anHTMLString -- text copied into the scanner's buffer
 *  @param   aCharset -- charset name forwarded to SetDocumentCharset
 *  @param   aSource -- priority of aCharset, forwarded to SetDocumentCharset
 */
nsScanner::nsScanner(nsString& anHTMLString, const nsString& aCharset, nsCharsetSource aSource) :
  mBuffer(anHTMLString), mFilename(""), mUnicodeXferBuf("")
{
  MOZ_COUNT_CTOR(nsScanner);

  // No stream is involved when scanning a caller-supplied string.
  mInputStream=0;
  mOwnsStream=PR_FALSE;
  mIncremental=PR_FALSE;

  // The whole string counts as already read; scanning starts at its front.
  mTotalRead=mBuffer.Length();
  mOffset=0;
  mMarkPos=0;
  mNewlinesSkipped=0;

  // SetDocumentCharset compares against mCharsetSource/mCharset and releases
  // mUnicodeDecoder, so all three must hold their "nothing chosen yet"
  // values before it is called.
  mUnicodeDecoder = 0;
  mCharset = "";
  mCharsetSource = kCharsetUninitialized;
  SetDocumentCharset(aCharset, aSource);
}
/**
 *  Builds an incremental scanner identified by a filename. The buffer
 *  starts out empty; data can still be supplied later via Append.
 *
 *  @update  gess 5/12/98
 *  @param   aFilename -- name recorded in mFilename, and the file that is
 *                        opened when aCreateStream is true
 *  @param   aCreateStream -- when true, an nsInputFileStream is opened on
 *                        aFilename and the scanner owns (deletes) it
 *  @param   aCharset -- charset name forwarded to SetDocumentCharset
 *  @param   aSource -- priority of aCharset, forwarded to SetDocumentCharset
 */
nsScanner::nsScanner(nsString& aFilename,PRBool aCreateStream, const nsString& aCharset, nsCharsetSource aSource) :
  mBuffer(""), mFilename(aFilename), mUnicodeXferBuf("")
{
  MOZ_COUNT_CTOR(nsScanner);

  // Nothing has been read yet.
  mTotalRead=0;
  mOffset=0;
  mMarkPos=0;
  mNewlinesSkipped=0;
  mIncremental=PR_TRUE;

  // A stream exists only if the caller asked for one; in that case we own it.
  mOwnsStream=aCreateStream;
  mInputStream=(aCreateStream) ? new nsInputFileStream(nsFileSpec(aFilename)) : 0;

  // SetDocumentCharset compares against mCharsetSource/mCharset and releases
  // mUnicodeDecoder, so all three must hold their "nothing chosen yet"
  // values before it is called.
  mUnicodeDecoder = 0;
  mCharset = "";
  mCharsetSource = kCharsetUninitialized;
  SetDocumentCharset(aCharset, aSource);
}
/**
 *  Builds a non-incremental scanner that reads from a caller-supplied
 *  stream. The scanner keeps a pointer to the stream but does not take
 *  ownership of it (mOwnsStream is PR_FALSE).
 *
 *  @update  gess 5/12/98
 *  @param   aFilename -- name recorded in mFilename
 *  @param   aStream -- stream to read from; must outlive the scanner
 *  @param   aCharset -- charset name forwarded to SetDocumentCharset
 *  @param   aSource -- priority of aCharset, forwarded to SetDocumentCharset
 */
nsScanner::nsScanner(nsString& aFilename,nsInputStream& aStream,const nsString& aCharset, nsCharsetSource aSource) :
  mBuffer(""), mFilename(aFilename) , mUnicodeXferBuf("")
{
  MOZ_COUNT_CTOR(nsScanner);

  // Borrow the caller's stream.
  mInputStream=&aStream;
  mOwnsStream=PR_FALSE;
  mIncremental=PR_FALSE;

  // Nothing has been read yet.
  mTotalRead=0;
  mOffset=0;
  mMarkPos=0;
  mNewlinesSkipped=0;

  // SetDocumentCharset compares against mCharsetSource/mCharset and releases
  // mUnicodeDecoder, so all three must hold their "nothing chosen yet"
  // values before it is called.
  mUnicodeDecoder = 0;
  mCharset = "";
  mCharsetSource = kCharsetUninitialized;
  SetDocumentCharset(aCharset, aSource);
}
/**
 *  Switch the scanner to a new document charset, provided the new
 *  request has at least the priority of the one currently in force.
 *  On success the unicode decoder is replaced with one for the new charset.
 *
 *  @param   aCharset -- requested charset name (may be an alias)
 *  @param   aSource -- priority of the request
 *  @return  NS_OK when nothing needed to change; otherwise the status of
 *           the last service call made
 */
nsresult nsScanner::SetDocumentCharset(const nsString& aCharset , nsCharsetSource aSource) {

  nsresult res = NS_OK;

  // A request with lower priority than the current charset is ignored.
  if( aSource < mCharsetSource)
    return res;

  NS_WITH_SERVICE(nsICharsetAlias, calias, kCharsetAliasCID, &res);
  NS_ASSERTION( nsnull != calias, "cannot find charset alias");
  if( NS_FAILED(res) || (nsnull == calias))
    return res;

  // Same charset under a different alias: leave everything as it is.
  PRBool same = PR_FALSE;
  res = calias->Equals(aCharset, mCharset, &same);
  if(NS_SUCCEEDED(res) && same)
    return NS_OK;

  // Different charset: resolve the alias to its preferred name.
  nsAutoString charsetName = aCharset;
  res = calias->GetPreferred(aCharset, charsetName);
  if(NS_FAILED(res) && (kCharsetUninitialized == mCharsetSource) )
  {
    // unknown alias and nothing chosen so far: fall back to ISO-8859-1
    charsetName = "ISO-8859-1";
  }
  mCharset = charsetName;
  mCharsetSource = aSource;

  NS_WITH_SERVICE(nsICharsetConverterManager, ccm, kCharsetConverterManagerCID, &res);
  if(NS_FAILED(res) || (nsnull == ccm))
    return res;

  // Only drop the old decoder once a replacement is actually in hand.
  nsIUnicodeDecoder * decoder = nsnull;
  res = ccm->GetUnicodeDecoder(&mCharset, &decoder);
  if(NS_SUCCEEDED(res) && (nsnull != decoder))
  {
    NS_IF_RELEASE(mUnicodeDecoder);
    mUnicodeDecoder = decoder;
  }
  return res;
}
/**
 *  Destructor. Closes the input stream if there is one, deletes it
 *  when the scanner owns it, and releases the unicode decoder.
 *
 *  @update  gess 3/25/98
 */
nsScanner::~nsScanner() {

  MOZ_COUNT_DTOR(nsScanner);

  if(mInputStream) {
    // The stream is closed whether or not we own it; it is deleted only
    // when this scanner created it.
    mInputStream->close();
    if(mOwnsStream) {
      delete mInputStream;
    }
    mInputStream=0;
  }

  NS_IF_RELEASE(mUnicodeDecoder);
}
/**
 *  Moves the current read offset back to the position last recorded
 *  by Mark(), so scanning can resume from there (for example after
 *  tokenization was interrupted).
 *  NOTE: Call Mark() before relying on this; otherwise the offset is
 *        simply reset to whatever mMarkPos currently holds.
 *
 *  @update  gess 5/12/98
 *  @return  the offset after rewinding
 */
PRUint32 nsScanner::RewindToMark(void){
  return (mOffset=mMarkPos);
}
/**
 *  Records the current read offset as the mark, so RewindToMark() can
 *  return to it later.
 *
 *  When anIndex is kNotFound the mark is set at the current offset; if
 *  that offset exceeds eBufferSizeThreshold the already-consumed front
 *  of the buffer is discarded first and the offset restarts at 0.
 *  Any other anIndex just becomes the new read offset; the mark itself
 *  is left unchanged in that case.
 *
 *  @update  gess 7/29/98
 *  @param   anIndex -- kNotFound to mark here, else an offset to jump to
 *  @return  always 0
 */
PRUint32 nsScanner::Mark(PRInt32 anIndex){

  if(kNotFound!=anIndex) {
    mOffset=(PRUint32)anIndex;
    return 0;
  }

  if((mOffset>0) && (mOffset>eBufferSizeThreshold)) {
    // drop everything in front of the position being marked
    mBuffer.Cut(0,mOffset);
    mOffset=0;
  }
  mMarkPos=mOffset;
  return 0;
}
/**
 *  Splices the given text into the input buffer at the current read
 *  offset, as if it had just arrived from the input stream, and adds
 *  its length to the running total of characters read.
 *
 *  @update  harishd 01/12/99
 *  @param   aBuffer -- text to insert
 *  @return  always PR_TRUE
 */
PRBool nsScanner::Insert(const nsString& aBuffer) {
  const PRInt32 insertedLen=aBuffer.Length();

  mBuffer.Insert(aBuffer,mOffset,insertedLen);
  mTotalRead+=insertedLen;

  return PR_TRUE;
}
/**
* Append data to our underlying input buffer as
* if it were read from an input stream.
*
* @update gess4/3/98
* @return error code
*/
PRBool nsScanner::Append(const nsString& aBuffer) {
PRInt32 theLen=mBuffer.Length();
mBuffer.Append(aBuffer);
mTotalRead+=aBuffer.Length();
if(theLenGetMaxLength(aBuffer, aLen, &unicharBufLen);
mUnicodeXferBuf.SetCapacity(unicharBufLen+32);
mUnicodeXferBuf.Truncate();
PRUnichar *unichars = (PRUnichar*)mUnicodeXferBuf.GetUnicode();
nsresult res;
do {
PRInt32 srcLength = aLen;
PRInt32 unicharLength = unicharBufLen;
res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength);
unichars[unicharLength]=0; //add this since the unicode converters can't be trusted to do so.
mBuffer.Append(unichars, unicharLength);
mTotalRead += unicharLength;
// if we failed, we consume one byte by replace it with U+FFFD
// and try conversion again.
if(NS_FAILED(res)) {
mUnicodeDecoder->Reset();
mBuffer.Append( (PRUnichar)0xFFFD);
mTotalRead++;
if(((PRUint32) (srcLength + 1)) > aLen)
srcLength = aLen;
else
srcLength++;
aBuffer += srcLength;
aLen -= srcLength;
}
} while (NS_FAILED(res) && (aLen > 0));
// we continue convert the bytes data into Unicode
// if we have conversion error and we have more data.
// delete[] unichars;
}
else {
mBuffer.Append(aBuffer,aLen);
mTotalRead+=aLen;
}
if(theLenread(buf, kBufsize);
if (0 == numread) {
return kEOF;
}
}
mOffset=mBuffer.Length();
if((0=(PRUint32)mBuffer.Length()) {
theError=FillBuffer();
}
if(NS_OK==theError) {
if (0==(PRUint32)mBuffer.Length()) {
return kEOF;
}
}
return theError;
}
/**
 *  Consumes and returns the next character of the scanner's input.
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- receives the character; set to 0 when none is available
 *  @return  NS_OK on success, otherwise the status reported by Eof()
 */
nsresult nsScanner::GetChar(PRUnichar& aChar) {

  aChar=0;

  // Buffer exhausted: let Eof() try to obtain more input.
  if(mOffset>=(PRUint32)mBuffer.Length()) {
    nsresult status=Eof();
    if(NS_OK != status)
      return status;
  }

  aChar=GetCharAt(mBuffer,mOffset++);
  return NS_OK;
}
/**
 *  Returns the next character of the scanner's input without
 *  consuming it (the read offset is left where it was).
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- receives the character; set to 0 when none is available
 *  @return  NS_OK on success, otherwise the status reported by Eof()
 */
nsresult nsScanner::Peek(PRUnichar& aChar) {

  aChar=0;

  // Buffer exhausted: let Eof() try to obtain more input.
  if(mOffset>=(PRUint32)mBuffer.Length()) {
    nsresult status=Eof();
    if(NS_OK != status)
      return status;
  }

  aChar=GetCharAt(mBuffer,mOffset);
  return NS_OK;
}
/**
 *  Pushes a character back onto the scanner. If anything has been
 *  consumed, the read offset simply steps back by one (aChar itself is
 *  not written); only when the offset is already at the front of the
 *  buffer is aChar inserted there.
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- character being returned to the input
 *  @return  always NS_OK
 */
nsresult nsScanner::PutBack(PRUnichar aChar) {

  if(mOffset>0) {
    mOffset--;
    return NS_OK;
  }

  mBuffer.Insert(aChar,0);
  return NS_OK;
}
/**
 *  Skips whitespace on the scanner's input. Space, '\n', '\r', '\t'
 *  and '\b' are skipped; the number of '\n' characters passed is left
 *  in mNewlinesSkipped. Scanning stops in front of the first other
 *  non-null character. Null characters inside the buffer are skipped.
 *
 *  @update  gess 3/25/98
 *  @return  NS_OK when a non-whitespace char was reached, otherwise the
 *           status from Peek() (e.g. end of input)
 */
nsresult nsScanner::SkipWhitespace(void) {

  PRUnichar         theChar=0;
  nsresult          result=Peek(theChar);
  const PRUnichar*  theBuf=mBuffer.GetUnicode();

  mNewlinesSkipped = 0;

  while(NS_OK==result) {

    theChar=theBuf[mOffset++];
    if(theChar) {
      PRBool isSpace=PR_TRUE;
      switch(theChar) {
        case '\n':
          mNewlinesSkipped++;
          break;
        case ' ' :
        case '\r':
        case '\b':
        case '\t':
          break;
        default:
          isSpace=PR_FALSE;
          break;
      }
      if(!isSpace) {
        mOffset-=1;  // leave the non-whitespace char unread
        break;
      }
    }
    // Only a null read *past* the last buffered char means the buffer is
    // exhausted. Using '<=' here would also fire for a null that is the
    // last real char; the offset would be backed up onto it, Peek() would
    // succeed, and the loop would never terminate.
    else if ((PRUint32)mBuffer.Length()<mOffset) {
      mOffset-=1;
      result=Peek(theChar);          // may pull more data into the buffer
      theBuf=mBuffer.GetUnicode();   // buffer storage may have moved
    }
  }

  return result;
}
/**
 *  Consumes input for as long as each character equals aSkipChar.
 *  The first differing character is put back, so it is the next one read.
 *
 *  @update  gess 3/25/98
 *  @param   aSkipChar -- the character to skip over
 *  @return  NS_OK when a differing char was found, otherwise the status
 *           from GetChar()
 */
nsresult nsScanner::SkipOver(PRUnichar aSkipChar){

  PRUnichar theChar=0;
  nsresult  result;

  while(NS_OK==(result=GetChar(theChar))) {
    if(theChar!=aSkipChar) {
      PutBack(theChar);
      break;
    }
  }

  return result;
}
/**
 *  Consumes input for as long as each character occurs in aSkipSet.
 *  The first character not in the set is put back, so it is the next
 *  one read.
 *
 *  @update  gess 3/25/98
 *  @param   aSkipSet -- characters to skip; looked up with FindChar
 *  @return  NS_OK when a char outside the set was found, otherwise the
 *           status from GetChar()
 */
nsresult nsScanner::SkipOver(nsString& aSkipSet){

  PRUnichar theChar=0;
  nsresult  result;

  while(NS_OK==(result=GetChar(theChar))) {
    if(kNotFound==aSkipSet.FindChar(theChar)) {
      PutBack(theChar);
      break;
    }
  }

  return result;
}
/**
 *  Consumes input until a character that occurs in aValidSet is met.
 *  That character is put back, so it is the next one read.
 *
 *  @update  gess 3/25/98
 *  @param   aValidSet -- characters being searched for; looked up with
 *                        FindChar
 *  @return  NS_OK when a char from the set was found, otherwise the
 *           status from GetChar()
 */
nsresult nsScanner::SkipTo(nsString& aValidSet){

  PRUnichar theChar=0;
  nsresult  result;

  while(NS_OK==(result=GetChar(theChar))) {
    if(kNotFound!=aValidSet.FindChar(theChar)) {
      PutBack(theChar);
      break;
    }
  }

  return result;
}
#if 0
void DoErrTest(nsString& aString) {
PRInt32 pos=aString.FindChar(0);
if(kNotFoundin the
* given validSet of input chars.
*
* @update gess 3/25/98
* @param aString will contain the result of this method
* @param aValidSet is an ordered string that contains the
* valid characters
* @return error code
*/
/**
 *  Consume chars as long as they are in the given set of valid chars,
 *  appending what was consumed to aString. Null characters inside the
 *  buffer do not end the run.
 *
 *  @param   aString -- receives the characters that were read
 *  @param   aValidSet -- the characters that may be consumed
 *  @param   anOrderedSet -- when true aValidSet must be ordered and is
 *                        searched with BinarySearch, else with FindChar
 *  @param   addTerminal -- when true the first char outside the set is
 *                        consumed and appended as well
 *  @return  NS_OK when a char outside the set ended the run, otherwise
 *           the status from Peek() (e.g. end of input)
 */
nsresult nsScanner::ReadWhile(nsString& aString,
                              nsString& aValidSet,
                              PRBool anOrderedSet,
                              PRBool addTerminal){

  NS_ASSERTION(((PR_FALSE==anOrderedSet) || aValidSet.IsOrdered()),kUnorderedStringError);

  PRUnichar         theChar=0;
  nsresult          result=Peek(theChar);
  const PRUnichar*  theBuf=mBuffer.GetUnicode();
  PRInt32           theOrigin=mOffset;   // start of the run not yet appended

  while(NS_OK==result) {

    theChar=theBuf[mOffset++];
    if(theChar) {
      PRInt32 pos=(anOrderedSet) ? aValidSet.BinarySearch(theChar) : aValidSet.FindChar(theChar);
      if(kNotFound==pos) {
        if(!addTerminal)
          mOffset-=1;  // leave the terminating char unread
        aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
        break;
      }
    }
    // Only a null read *past* the last buffered char means the buffer is
    // exhausted. Using '<=' here would also fire for a null that is the
    // last real char; the offset would be backed up onto it, Peek() would
    // succeed, and the loop would never terminate.
    else if ((PRUint32)mBuffer.Length()<mOffset) {
      mOffset -= 1;
      aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
      result=Peek(theChar);          // may pull more data into the buffer
      theBuf=mBuffer.GetUnicode();   // buffer storage may have moved
      theOrigin=mOffset;
    }
  }

  return result;
}
/**
 *  Consume chars as long as they are in the given set of valid chars,
 *  appending what was consumed to aString. Null characters inside the
 *  buffer do not end the run.
 *
 *  @update  gess 3/25/98
 *  @param   aString -- receives the characters that were read
 *  @param   aValidSet -- the characters that may be consumed
 *  @param   anOrderedSet -- when true aValidSet must be ordered and is
 *                        searched with BinarySearch, else with FindChar
 *  @param   addTerminal -- when true the first char outside the set is
 *                        consumed and appended as well
 *  @return  NS_OK when a char outside the set ended the run, otherwise
 *           the status from Peek() (e.g. end of input)
 */
nsresult nsScanner::ReadWhile(nsString& aString,
                              nsCString& aValidSet,
                              PRBool anOrderedSet,
                              PRBool addTerminal){

  NS_ASSERTION(((PR_FALSE==anOrderedSet) || aValidSet.IsOrdered()),kUnorderedStringError);

  PRUnichar         theChar=0;
  nsresult          result=Peek(theChar);
  const PRUnichar*  theBuf=mBuffer.GetUnicode();
  PRInt32           theOrigin=mOffset;   // start of the run not yet appended

  while(NS_OK==result) {

    theChar=theBuf[mOffset++];
    if(theChar) {
      PRInt32 pos=(anOrderedSet) ? aValidSet.BinarySearch(theChar) : aValidSet.FindChar(theChar);
      if(kNotFound==pos) {
        if(!addTerminal)
          mOffset-=1;  // leave the terminating char unread
        aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
        break;
      }
    }
    // Only a null read *past* the last buffered char means the buffer is
    // exhausted. Using '<=' here would also fire for a null that is the
    // last real char; the offset would be backed up onto it, Peek() would
    // succeed, and the loop would never terminate.
    else if ((PRUint32)mBuffer.Length()<mOffset) {
      mOffset -= 1;
      aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
      result=Peek(theChar);          // may pull more data into the buffer
      theBuf=mBuffer.GetUnicode();   // buffer storage may have moved
      theOrigin=mOffset;
    }
  }

  return result;
}
/**
* Consume chars as long as they are in the
* given validSet of input chars.
*
* @update gess 3/25/98
* @param aString will contain the result of this method
* @param anInputSet contains the legal input chars
* valid characters
* @return error code
*/
nsresult nsScanner::ReadWhile(nsString& aString,
const char* anInputSet,
PRBool anOrderedSet,
PRBool addTerminal)
{
nsresult result=NS_OK;
if(anInputSet) {
PRInt32 len=nsCRT::strlen(anInputSet);
if(0