/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */

//#define __INCREMENTAL 1

#define NS_IMPL_IDS
#include "nsScanner.h"
#include "nsDebug.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
#include "nsICharsetAlias.h"
#include "nsFileSpec.h"

static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);

/* NOTE(review): this literal contains raw line breaks and looks as if markup
 * was stripped out of it by whatever tool produced this copy of the file.
 * Restore it from the upstream revision before building. */
const char* kBadHTMLText="

Oops...

You just tried to read a non-existent document:
";

/* Assertion message used by the ReadWhile() overloads when the caller claims
 * the set is ordered but it is not. */
const char* kUnorderedStringError = "String argument must be ordered. Don't you read API's?";

/* Number of bytes requested per stream read by the buffer-filling code:
 * 1 when __INCREMENTAL is defined, 64 otherwise. */
#ifdef __INCREMENTAL
const int kBufsize=1;
#else
const int kBufsize=64;
#endif

MOZ_DECL_CTOR_COUNTER(nsScanner);

/**
 *  Use this constructor if you want i/o to be based on
 *  a single string you hand in during construction.
 *  This short cut was added for Javascript.
 *
 *  The scanner is set up as non-incremental, with no input stream and no
 *  stream ownership; mTotalRead starts at the length of the buffer.
 *
 *  @update  gess 5/12/98
 *  @param   anHTMLString -- used to initialize mBuffer
 *  @param   aCharset -- forwarded to SetDocumentCharset()
 *  @param   aSource -- forwarded to SetDocumentCharset()
 */
nsScanner::nsScanner(nsString& anHTMLString, const nsString& aCharset, nsCharsetSource aSource) :
  mBuffer(anHTMLString), mFilename(""), mUnicodeXferBuf("")
{
  MOZ_COUNT_CTOR(nsScanner);

  mTotalRead=mBuffer.Length();
  mIncremental=PR_FALSE;
  mOwnsStream=PR_FALSE;
  mOffset=0;
  mMarkPos=0;
  mInputStream=0;
  mUnicodeDecoder = 0;
  // Start from the "uninitialized" state so that the call below is never
  // rejected by the source-priority check in SetDocumentCharset().
  mCharset = "";
  mCharsetSource = kCharsetUninitialized;
  SetDocumentCharset(aCharset, aSource);
  mNewlinesSkipped=0;
}

/**
 *  Use this constructor if you want i/o to be based on strings
 *  the scanner receives. If you pass a null filename, you
 *  can still provide data to the scanner via append.
 *
 *  The scanner is set up as incremental. When aCreateStream is true a new
 *  nsInputFileStream is created on nsFileSpec(aFilename) and mOwnsStream is
 *  set, otherwise mInputStream stays 0.
 *
 *  @update  gess 5/12/98
 *  @param   aFilename -- stored in mFilename; also the file to open when
 *                        aCreateStream is true
 *  @param   aCreateStream -- whether to create (and own) a file stream
 *  @param   aCharset -- forwarded to SetDocumentCharset()
 *  @param   aSource -- forwarded to SetDocumentCharset()
 */
nsScanner::nsScanner(nsString& aFilename,PRBool aCreateStream, const nsString& aCharset, nsCharsetSource aSource) :
    mBuffer(""), mFilename(aFilename), mUnicodeXferBuf("")
{
  MOZ_COUNT_CTOR(nsScanner);

  mIncremental=PR_TRUE;
  mOffset=0;
  mMarkPos=0;
  mTotalRead=0;
  mOwnsStream=aCreateStream;
  mInputStream=0;
  if(aCreateStream) {
    mInputStream = new nsInputFileStream(nsFileSpec(aFilename));
  } //if
  mUnicodeDecoder = 0;
  mCharset = "";
  mCharsetSource = kCharsetUninitialized;
  SetDocumentCharset(aCharset, aSource);
  mNewlinesSkipped=0;
}

/**
 *  Use this constructor if you want i/o to be stream based.
*
 *  The scanner is set up as non-incremental. The address of aStream is kept
 *  in mInputStream and mOwnsStream is PR_FALSE.
 *
 *  @update  gess 5/12/98
 *  @param   aFilename -- stored in mFilename
 *  @param   aStream -- the stream to read from
 *  @param   aCharset -- forwarded to SetDocumentCharset()
 *  @param   aSource -- forwarded to SetDocumentCharset()
 */
nsScanner::nsScanner(nsString& aFilename,nsInputStream& aStream,const nsString& aCharset, nsCharsetSource aSource) :
    mBuffer(""), mFilename(aFilename) , mUnicodeXferBuf("")
{
  MOZ_COUNT_CTOR(nsScanner);

  mIncremental=PR_FALSE;
  mOffset=0;
  mMarkPos=0;
  mTotalRead=0;
  mOwnsStream=PR_FALSE;
  mInputStream=&aStream;
  mUnicodeDecoder = 0;
  mCharset = "";
  mCharsetSource = kCharsetUninitialized;
  SetDocumentCharset(aCharset, aSource);
  mNewlinesSkipped=0;
}

/**
 *  Change the document charset and the decoder that goes with it.
 *
 *  Nothing happens (NS_OK is returned) when aSource is lower than the
 *  current mCharsetSource, or when the alias service reports that aCharset
 *  equals the current mCharset. Otherwise mCharset and mCharsetSource are
 *  updated and a decoder for the new charset is requested; mUnicodeDecoder
 *  is replaced only if that request succeeds.
 *
 *  @param   aCharset -- requested charset name (may be an alias)
 *  @param   aSource -- where the charset information came from
 *  @return  NS_OK on the early-outs above, otherwise the result of the last
 *           service call that was made
 */
nsresult nsScanner::SetDocumentCharset(const nsString& aCharset , nsCharsetSource aSource) {

  nsresult res = NS_OK;

  if( aSource < mCharsetSource) // priority is lower than the current one, just
    return res;

  NS_WITH_SERVICE(nsICharsetAlias, calias, kCharsetAliasCID, &res);
  NS_ASSERTION( nsnull != calias, "cannot find charset alias");
  nsAutoString charsetName = aCharset;
  if( NS_SUCCEEDED(res) && (nsnull != calias))
  {
    PRBool same = PR_FALSE;
    res = calias->Equals(aCharset, mCharset, &same);
    if(NS_SUCCEEDED(res) && same)
    {
      return NS_OK; // no difference, don't change it
    }
    // different, need to change it
    res = calias->GetPreferred(aCharset, charsetName);

    if(NS_FAILED(res) && (kCharsetUninitialized == mCharsetSource) )
    {
      // failed - unknown alias, fall back to ISO-8859-1
      charsetName = "ISO-8859-1";
    }
    // NOTE(review): when GetPreferred() fails and a charset source was
    // already set, mCharset is still overwritten with whatever charsetName
    // holds at this point -- confirm that this is intended.
    mCharset = charsetName;
    mCharsetSource = aSource;

    NS_WITH_SERVICE(nsICharsetConverterManager, ccm, kCharsetConverterManagerCID, &res);
    if(NS_SUCCEEDED(res) && (nsnull != ccm))
    {
      nsIUnicodeDecoder * decoder = nsnull;
      res = ccm->GetUnicodeDecoder(&mCharset, &decoder);
      if(NS_SUCCEEDED(res) && (nsnull != decoder))
      {
        // Only drop the old decoder once a replacement is in hand.
        NS_IF_RELEASE(mUnicodeDecoder);

        mUnicodeDecoder = decoder;
      }
    }
  }
  return res;
}


/**
 *  default destructor
 *
 *  Closes the input stream if there is one, deletes it only when the
 *  scanner owns it, and releases the unicode decoder.
 *
 *  @update  gess 3/25/98
 */
nsScanner::~nsScanner() {

  MOZ_COUNT_DTOR(nsScanner);

  if(mInputStream) {
    // The stream is closed whether or not it is owned; only an owned
    // stream is deleted.
    mInputStream->close();
    if(mOwnsStream)
      delete mInputStream;
  }
  mInputStream=0;
  NS_IF_RELEASE(mUnicodeDecoder);
}

/**
 *  Resets current offset position of input stream to marked position.
 *  This allows us to back up to this point if the need should arise,
 *  such as when tokenization gets interrupted.
 *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
 *
 *  @update  gess 5/12/98
 *  @return  the new value of mOffset (equal to mMarkPos)
 */
PRUint32 nsScanner::RewindToMark(void){
  mOffset=mMarkPos;
  return mOffset;
}


/**
 *  Records current offset position in input stream. This allows us
 *  to back up to this point if the need should arise, such as when
 *  tokenization gets interrupted.
 *
 *  With anIndex == kNotFound the mark is set to the current offset; if the
 *  offset has grown past eBufferSizeThreshold the already-consumed prefix of
 *  mBuffer is cut away first and the offset restarts at 0. With any other
 *  anIndex the current offset is moved to anIndex and the mark is left
 *  untouched.
 *
 *  @update  gess 7/29/98
 *  @param   anIndex -- kNotFound to mark the current position, otherwise
 *                      the offset to jump to
 *  @return  always 0
 */
PRUint32 nsScanner::Mark(PRInt32 anIndex){
  if(kNotFound==anIndex) {
    if((mOffset>0) && (mOffset>eBufferSizeThreshold)) {
      mBuffer.Cut(0,mOffset);   //delete chars up to mark position
      mOffset=0;
    }
    mMarkPos=mOffset;
  }
  else mOffset=(PRUint32)anIndex;
  return 0;
}


/**
 *  Insert data to our underlying input buffer as
 *  if it were read from an input stream.
 *  The data goes in at the current offset (mOffset) and mTotalRead grows by
 *  its length.
 *
 *  @update  harishd 01/12/99
 *  @param   aBuffer -- the text to insert
 *  @return  always PR_TRUE
 */
PRBool nsScanner::Insert(const nsString& aBuffer) {

  PRInt32 theLen=aBuffer.Length();

  mBuffer.Insert(aBuffer,mOffset,theLen);
  mTotalRead+=theLen;
  return PR_TRUE;
}

/**
 *  Append data to our underlying input buffer as
 *  if it were read from an input stream.
 *
 *  NOTE(review): the tail of this function is missing from this copy of the
 *  file (see the note in the body), so its return value cannot be documented
 *  from here.
 *
 *  @update  gess4/3/98
 *  @param   aBuffer -- the text to append
 */
PRBool nsScanner::Append(const nsString& aBuffer) {

  PRInt32 theLen=mBuffer.Length();

  mBuffer.Append(aBuffer);
  mTotalRead+=aBuffer.Length();
  /* NOTE(review): source text has been lost on the next line, apparently
   * everything between a '<' and a later '>'. The end of Append(const
   * nsString&) and the beginning of what looks like a byte-buffer Append
   * overload (it uses aBuffer/aLen and mUnicodeDecoder) are gone. The
   * remaining text is kept verbatim; restore from the upstream revision. */
  if(theLenGetMaxLength(aBuffer, aLen, &unicharBufLen);
    mUnicodeXferBuf.SetCapacity(unicharBufLen+32);
    mUnicodeXferBuf.Truncate();
    PRUnichar *unichars = (PRUnichar*)mUnicodeXferBuf.GetUnicode();

    nsresult res;
    do {
      PRInt32 srcLength = aLen;
      PRInt32 unicharLength = unicharBufLen;
      res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength);
      unichars[unicharLength]=0;  //add this since the unicode converters can't be trusted to do so.
      mBuffer.Append(unichars, unicharLength);
      mTotalRead += unicharLength;
      // if we failed, we consume one byte by replacing it with U+FFFD
      // and try the conversion again.
      if(NS_FAILED(res)) {
        mUnicodeDecoder->Reset();
        mBuffer.Append( (PRUnichar)0xFFFD);
        mTotalRead++;
        // Step over the bytes that were converted plus the offending one,
        // without running past the end of the input.
        if(((PRUint32) (srcLength + 1)) > aLen)
          srcLength = aLen;
        else
          srcLength++;
        aBuffer += srcLength;
        aLen -= srcLength;
      }
    } while (NS_FAILED(res) && (aLen > 0));
          // we continue converting the byte data into Unicode
          // if we had a conversion error and we have more data.

    // delete[] unichars;
  }
  else {
    mBuffer.Append(aBuffer,aLen);
    mTotalRead+=aLen;
  }
  /* NOTE(review): more lost text on the next line. The end of the function
   * above and the beginning of the buffer-filling routine (which reads
   * kBufsize bytes from the stream into buf) are missing. Kept verbatim. */
  if(theLenread(buf, kBufsize);
    if (0 == numread) {
      return kEOF;
    }
  }
  mOffset=mBuffer.Length();
  /* NOTE(review): lost text again on the next line. What follows appears to
   * be the tail of Eof(): it calls FillBuffer() and reports kEOF when the
   * buffer is still empty afterwards. Kept verbatim. */
  if((0=(PRUint32)mBuffer.Length()) {
    theError=FillBuffer();
  }

  if(NS_OK==theError) {
    if (0==(PRUint32)mBuffer.Length()) {
      return kEOF;
    }
  }

  return theError;
}

/**
 *  retrieve next char from scanners internal input stream
 *
 *  When the offset has reached the end of the buffer, Eof() is consulted
 *  first; the char is only fetched (and the offset advanced) when the
 *  result is NS_OK.
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- receives the char that was read; 0 on failure
 *  @return  error code reflecting read status
 */
nsresult nsScanner::GetChar(PRUnichar& aChar) {
  nsresult result=NS_OK;
  aChar=0;
  if(mOffset>=(PRUint32)mBuffer.Length())
    result=Eof();

  if(NS_OK == result){
    aChar=GetCharAt(mBuffer,mOffset++);
  }
  return result;
}


/**
 *  peek ahead at the next char in the scanner's internal
 *  input buffer without consuming it (mOffset is not advanced)
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- receives the next char; 0 on failure
 *  @return  error code reflecting read status
 */
nsresult nsScanner::Peek(PRUnichar& aChar) {
  nsresult result=NS_OK;
  aChar=0;
  if(mOffset>=(PRUint32)mBuffer.Length())
    result=Eof();

  if(NS_OK == result){
    aChar=GetCharAt(mBuffer,mOffset);
  }
  return result;
}


/**
 *  Push the given char back onto the scanner
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- the char to push back; only used when the offset is
 *                    already 0 (see below)
 *  @return  always NS_OK
 */
nsresult nsScanner::PutBack(PRUnichar aChar) {
  // When something has been consumed it is enough to step back one
  // position; aChar is only inserted when there is nothing to step back over.
  if(mOffset>0)
    mOffset--;
  else mBuffer.Insert(aChar,0);
  return NS_OK;
}


/**
 *  Skip whitespace on scanner input stream
 *
 *  Space, '\n', '\r', '\b' and '\t' are skipped. mNewlinesSkipped is reset
 *  on entry and counts the '\n' chars that were skipped. The first other
 *  char is left unread.
 *
 *  @update  gess 3/25/98
 *  @return  NS_OK when a non-whitespace char stopped the scan, otherwise
 *           the result of the last Peek()
 */
nsresult nsScanner::SkipWhitespace(void) {

  PRUnichar theChar=0;
  nsresult  result=Peek(theChar);
  const PRUnichar* theBuf=mBuffer.GetUnicode();
  PRInt32 theOrigin=mOffset;
  PRBool found=PR_FALSE;
  mNewlinesSkipped = 0;

  while(NS_OK==result) {

    theChar=theBuf[mOffset++];
    if(theChar) {
      switch(theChar) {
        case '\n': mNewlinesSkipped++;   // falls through into the cases below
        case ' ' :
        case '\r':
        case '\b':
        case '\t':
          found=PR_TRUE;
          break;
        default:
          found=PR_FALSE;
          break;
      }
      if(!found) {
        mOffset-=1;   // leave the non-whitespace char unread
        break;
      }
    }
    else if ((PRUint32)mBuffer.Length()<=mOffset) {
      // Ran off the end of the buffer: step back, let Peek() consult Eof(),
      // then re-fetch the buffer pointer before continuing.
      mOffset-=1;
      result=Peek(theChar);
      theBuf=mBuffer.GetUnicode();
      theOrigin=mOffset;
    }
  }

  //DoErrTest(aString);

  return result;

}

/**
 *  Skip over chars as long as they equal given char
 *
 *  @update  gess 3/25/98
 *  @param   aSkipChar -- the char to skip
 *  @return  NS_OK when a different char was found (it is pushed back),
 *           otherwise the failing result of GetChar()
 */
nsresult nsScanner::SkipOver(PRUnichar aSkipChar){

  PRUnichar ch=0;
  nsresult   result=NS_OK;

  while(NS_OK==result) {
    result=GetChar(ch);
    if(NS_OK == result) {
      if(ch!=aSkipChar) {
        PutBack(ch);
        break;
      }
    }
    else break;
  } //while
  return result;

}

/**
 *  Skip over chars as long as they're in aSkipSet
 *
 *  @update  gess 3/25/98
 *  @param   aSkipSet -- the chars to skip; membership is tested with
 *                       FindChar()
 *  @return  NS_OK when a char outside the set was found (it is pushed
 *           back), otherwise the failing result of GetChar()
 */
nsresult nsScanner::SkipOver(nsString& aSkipSet){

  PRUnichar theChar=0;
  nsresult  result=NS_OK;

  while(NS_OK==result) {
    result=GetChar(theChar);
    if(NS_OK == result) {
      PRInt32 pos=aSkipSet.FindChar(theChar);
      if(kNotFound==pos) {
        PutBack(theChar);
        break;
      }
    }
    else break;
  } //while
  return result;

}


/**
 *  Skip over chars until they're in aValidSet
 *
 *  @update  gess 3/25/98
 *  @param   aValidSet -- the chars you're looking for; membership is
 *                        tested with FindChar()
 *  @return  NS_OK when a char from the set was found (it is pushed back),
 *           otherwise the failing result of GetChar()
 */
nsresult nsScanner::SkipTo(nsString& aValidSet){
  PRUnichar ch=0;
  nsresult  result=NS_OK;

  while(NS_OK==result) {
    result=GetChar(ch);
    if(NS_OK == result) {
      PRInt32 pos=aValidSet.FindChar(ch);
      if(kNotFound!=pos) {
        PutBack(ch);
        break;
      }
    }
    else break;
  } //while
  return result;
}

/* NOTE(review): the disabled helper below is damaged in this copy of the
 * file. Its body, the matching #endif and the opening of the next doc
 * comment are missing, so its last line runs straight into comment text.
 * Kept verbatim; restore from the upstream revision. */
#if 0
void DoErrTest(nsString& aString) {
  PRInt32 pos=aString.FindChar(0);
  if(kNotFoundin the
 *  given validSet of input chars.
*
 *  Chars are taken straight from the buffer and appended to aString until
 *  a char that is not in aValidSet turns up.
 *
 *  @update  gess 3/25/98
 *  @param   aString will contain the result of this method
 *  @param   aValidSet contains the valid characters
 *  @param   anOrderedSet when true, aValidSet is asserted to be ordered and
 *                        searched with BinarySearch(); otherwise FindChar()
 *                        is used
 *  @param   addTerminal when true, the terminating char is consumed and
 *                       appended as well; when false it is left unread
 *  @return  NS_OK when a terminating char was found, otherwise the result
 *           of the last Peek()
 */
nsresult nsScanner::ReadWhile(nsString& aString,
                             nsString& aValidSet,
                             PRBool anOrderedSet,
                             PRBool addTerminal){

  NS_ASSERTION(((PR_FALSE==anOrderedSet) || aValidSet.IsOrdered()),kUnorderedStringError);

  PRUnichar theChar=0;
  nsresult  result=Peek(theChar);
  const PRUnichar* theBuf=mBuffer.GetUnicode();
  PRInt32 theOrigin=mOffset;   // start of the run that has not been copied yet

  while(NS_OK==result) {

    theChar=theBuf[mOffset++];
    if(theChar) {
      PRInt32 pos=(anOrderedSet) ? aValidSet.BinarySearch(theChar) : aValidSet.FindChar(theChar);
      if(kNotFound==pos) {
        if(!addTerminal)
          mOffset-=1;   // leave the terminating char unread
        aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
        break;
      }
    }
    else if ((PRUint32)mBuffer.Length()<=mOffset) {
      // Ran off the end of the buffer: copy what has been scanned so far,
      // let Peek() consult Eof(), then re-fetch the buffer pointer and
      // start a new run at the current offset.
      mOffset -= 1;
      aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
      result=Peek(theChar);
      theBuf=mBuffer.GetUnicode();
      theOrigin=mOffset;
    }
  }

  //DoErrTest(aString);

  return result;

}

/**
 *  Consume chars as long as they are in the
 *  given validSet of input chars.
 *
 *  Same scanning loop as the nsString overload; only the type of the set
 *  differs.
 *
 *  @update  gess 3/25/98
 *  @param   aString will contain the result of this method
 *  @param   aValidSet contains the valid characters
 *  @param   anOrderedSet when true, aValidSet is asserted to be ordered and
 *                        searched with BinarySearch(); otherwise FindChar()
 *                        is used
 *  @param   addTerminal when true, the terminating char is consumed and
 *                       appended as well; when false it is left unread
 *  @return  NS_OK when a terminating char was found, otherwise the result
 *           of the last Peek()
 */
nsresult nsScanner::ReadWhile(nsString& aString,
                             nsCString& aValidSet,
                             PRBool anOrderedSet,
                             PRBool addTerminal){

  NS_ASSERTION(((PR_FALSE==anOrderedSet) || aValidSet.IsOrdered()),kUnorderedStringError);

  PRUnichar theChar=0;
  nsresult  result=Peek(theChar);
  const PRUnichar* theBuf=mBuffer.GetUnicode();
  PRInt32 theOrigin=mOffset;   // start of the run that has not been copied yet

  while(NS_OK==result) {

    theChar=theBuf[mOffset++];
    if(theChar) {
      PRInt32 pos=(anOrderedSet) ? aValidSet.BinarySearch(theChar) : aValidSet.FindChar(theChar);
      if(kNotFound==pos) {
        if(!addTerminal)
          mOffset-=1;   // leave the terminating char unread
        aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
        break;
      }
    }
    else if ((PRUint32)mBuffer.Length()<=mOffset) {
      // Ran off the end of the buffer: flush the run, consult Eof() through
      // Peek(), then re-fetch the buffer pointer.
      mOffset -= 1;
      aString.Append(&theBuf[theOrigin],mOffset-theOrigin);
      result=Peek(theChar);
      theBuf=mBuffer.GetUnicode();
      theOrigin=mOffset;
    }
  }

  //DoErrTest(aString);

  return result;

}


/**
 *  Consume chars as long as they are in the
 *  given validSet of input chars.
 *
 *  NOTE(review): this definition is cut off in this copy of the file, so
 *  only its signature and first statements are visible here.
 *
 *  @update  gess 3/25/98
 *  @param   aString will contain the result of this method
 *  @param   anInputSet contains the legal input chars; a null pointer is
 *                      tolerated (see the check below)
 *  @return  error code
 */
nsresult nsScanner::ReadWhile(nsString& aString,
                             const char* anInputSet,
                             PRBool anOrderedSet,
                             PRBool addTerminal)
{

  nsresult   result=NS_OK;

  if(anInputSet) {
    PRInt32 len=nsCRT::strlen(anInputSet);
    if(0