1998-04-14 00:24:54 +04:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
|
|
/*
|
|
|
|
* The contents of this file are subject to the Netscape Public License
|
|
|
|
* Version 1.0 (the "NPL"); you may not use this file except in
|
|
|
|
* compliance with the NPL. You may obtain a copy of the NPL at
|
|
|
|
* http://www.mozilla.org/NPL/
|
|
|
|
*
|
|
|
|
* Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
|
|
* for the specific language governing rights and limitations under the
|
|
|
|
* NPL.
|
|
|
|
*
|
|
|
|
* The Initial Developer of this code under the NPL is Netscape
|
|
|
|
* Communications Corporation. Portions created by Netscape are
|
|
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
|
|
* Reserved.
|
|
|
|
*/
|
1998-05-22 00:38:32 +04:00
|
|
|
|
1998-04-14 00:24:54 +04:00
|
|
|
/**
|
|
|
|
* MODULE NOTES:
|
|
|
|
* @update gess 4/1/98
|
|
|
|
*
|
|
|
|
* This class does two primary jobs:
|
|
|
|
* 1) It iterates the tokens provided during the
|
|
|
|
* tokenization process, identifying where elements
|
|
|
|
* begin and end (doing validation and normalization).
|
|
|
|
* 2) It controls and coordinates with an instance of
|
|
|
|
* the IContentSink interface, to coordinate the
|
|
|
|
* production of the content model.
|
|
|
|
*
|
|
|
|
* The basic operation of this class assumes that an HTML
|
|
|
|
* document is non-normalized. Therefore, we don't process
|
|
|
|
* the document in a normalized way. Don't bother to look
|
|
|
|
* for methods like: doHead() or doBody().
|
|
|
|
*
|
|
|
|
* Instead, in order to be backward compatible, we must
|
|
|
|
* scan the set of tokens and perform this basic set of
|
|
|
|
* operations:
|
|
|
|
* 1) Determine the token type (easy, since the tokens know)
|
|
|
|
* 2) Determine the appropriate section of the HTML document
|
|
|
|
* each token belongs in (HTML,HEAD,BODY,FRAMESET).
|
|
|
|
* 3) Insert content into our document (via the sink) into
|
|
|
|
* the correct section.
|
|
|
|
* 4) In the case of tags that belong in the BODY, we must
|
|
|
|
* ensure that our underlying document state reflects
|
|
|
|
* the appropriate context for our tag.
|
|
|
|
*
|
|
|
|
* For example,if we see a <TR>, we must ensure our
|
|
|
|
* document contains a table into which the row can
|
|
|
|
* be placed. This may result in "implicit containers"
|
|
|
|
* created to ensure a well-formed document.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
1998-06-19 05:00:27 +04:00
|
|
|
#ifndef NS_PARSER__
|
|
|
|
#define NS_PARSER__
|
1998-04-14 00:24:54 +04:00
|
|
|
|
|
|
|
#include "nsIParser.h"
|
|
|
|
#include "nsDeque.h"
|
|
|
|
#include "nsParserNode.h"
|
1998-04-22 22:32:49 +04:00
|
|
|
#include "nsParserTypes.h"
|
1998-05-15 02:19:08 +04:00
|
|
|
#include "nsIURL.h"
|
1998-07-14 01:13:09 +04:00
|
|
|
#include "CParserContext.h"
|
1998-04-14 00:24:54 +04:00
|
|
|
|
1998-06-19 05:00:27 +04:00
|
|
|
// Brace-initializer for the parser's interface ID: one 32-bit value, two
// 16-bit values, then eight individual bytes.
// NOTE(review): presumably consumed as an nsIID initializer; the use site is
// not in this header -- confirm.
// The three lines must stay contiguous: each trailing backslash continues the
// macro onto the very next line.
#define NS_PARSER_IID                                  \
  {0x2ce606b0, 0xbee6, 0x11d1,                         \
    {0xaa, 0xd9, 0x00, 0x80, 0x5f, 0x8a, 0x3e, 0x14}}
|
|
|
|
|
|
|
|
|
|
|
|
// Forward declarations for types this header only names through pointers or
// references, so their full definitions are not needed here.
class IContentSink;
// nsParser's declaration in this header names nsIContentSink (SetContentSink
// and the mSink member); declare it here instead of relying on another header
// to have introduced the name.
class nsIContentSink;
class nsIHTMLContentSink;
class nsIDTD;
class nsIDTDDebug;
class CScanner;
class nsIParserFilter;
// NOTE(review): this matches the pre-standard iostream library, where fstream
// is a plain global class. Under standard <fstream> the name is a typedef in
// namespace std and this declaration would not refer to it.
class fstream;
|
1998-04-14 00:24:54 +04:00
|
|
|
|
1998-06-19 05:00:27 +04:00
|
|
|
class nsParser : public nsIParser, public nsIStreamListener {
|
1998-04-14 00:24:54 +04:00
|
|
|
|
1998-04-25 23:45:14 +04:00
|
|
|
public:
|
1998-04-22 22:32:49 +04:00
|
|
|
friend class CTokenHandler;
|
1998-04-14 00:24:54 +04:00
|
|
|
|
|
|
|
NS_DECL_ISUPPORTS
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* default constructor
|
|
|
|
* @update gess5/11/98
|
|
|
|
*/
|
1998-06-19 05:00:27 +04:00
|
|
|
nsParser();
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor
|
|
|
|
* @update gess5/11/98
|
|
|
|
*/
|
1998-06-19 05:00:27 +04:00
|
|
|
~nsParser();
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Select given content sink into parser for parser output
|
|
|
|
* @update gess5/11/98
|
|
|
|
* @param aSink is the new sink to be used by parser
|
|
|
|
* @return old sink, or NULL
|
|
|
|
*/
|
|
|
|
virtual nsIContentSink* SetContentSink(nsIContentSink* aSink);
|
1998-06-18 03:13:28 +04:00
|
|
|
|
1998-06-19 05:00:27 +04:00
|
|
|
virtual nsIParserFilter* SetParserFilter(nsIParserFilter* aFilter);
|
1998-06-18 03:13:28 +04:00
|
|
|
|
1998-07-02 12:14:22 +04:00
|
|
|
virtual void RegisterDTD(nsIDTD* aDTD);
|
|
|
|
|
1998-06-18 03:13:28 +04:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* @update gess 6/9/98
|
|
|
|
* @param
|
|
|
|
* @return
|
|
|
|
*/
|
|
|
|
virtual CScanner* GetScanner(void);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-07-14 01:13:09 +04:00
|
|
|
* Cause parser to parse input from given URL
|
1998-05-12 04:59:32 +04:00
|
|
|
* @update gess5/11/98
|
|
|
|
* @param aURL is a descriptor for source document
|
1998-06-01 23:51:52 +04:00
|
|
|
* @param aListener is a listener to forward notifications to
|
1998-05-12 04:59:32 +04:00
|
|
|
* @return TRUE if all went well -- FALSE otherwise
|
|
|
|
*/
|
1998-06-01 23:51:52 +04:00
|
|
|
virtual PRInt32 Parse(nsIURL* aURL,
|
1998-07-14 01:13:09 +04:00
|
|
|
nsIStreamObserver* aListener,
|
|
|
|
nsIDTDDebug* aDTDDebug = 0);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Cause parser to parse input from given nsIInputStream
|
|
|
|
* @update gess5/11/98
|
|
|
|
* @param pIStream is an nsIInputStream
|
|
|
|
* @param aListener is a listener to forward notifications to
|
|
|
|
* @return TRUE if all went well -- FALSE otherwise
|
|
|
|
*/
|
|
|
|
virtual PRInt32 Parse(nsIInputStream* pIStream,
|
|
|
|
nsIStreamObserver* aListener,
|
|
|
|
nsIDTDDebug* aDTDDebug = 0);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-05-15 02:19:08 +04:00
|
|
|
* Cause parser to parse input from given file in given mode
|
1998-05-12 04:59:32 +04:00
|
|
|
* @update gess5/11/98
|
1998-05-15 02:19:08 +04:00
|
|
|
* @param aFilename is a path for file document
|
1998-05-12 04:59:32 +04:00
|
|
|
* @return TRUE if all went well -- FALSE otherwise
|
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
virtual PRInt32 Parse(nsString& aFilename);
|
1998-05-15 02:19:08 +04:00
|
|
|
|
1998-07-10 09:35:23 +04:00
|
|
|
/**
|
|
|
|
* Cause parser to parse input from given stream
|
|
|
|
* @update gess5/11/98
|
|
|
|
* @param aStream is the i/o source
|
|
|
|
* @return TRUE if all went well -- FALSE otherwise
|
|
|
|
*/
|
|
|
|
virtual PRInt32 Parse(fstream& aStream);
|
|
|
|
|
1998-05-15 02:19:08 +04:00
|
|
|
/**
|
|
|
|
* @update gess5/11/98
|
|
|
|
* @param anHTMLString contains a string-full of real HTML
|
|
|
|
* @param appendTokens tells us whether we should insert tokens inline, or append them.
|
|
|
|
* @return TRUE if all went well -- FALSE otherwise
|
|
|
|
*/
|
|
|
|
virtual PRInt32 Parse(nsString& anHTMLString,PRBool appendTokens);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* This method gets called (automatically) during incremental parsing
|
|
|
|
* @update gess5/11/98
|
|
|
|
* @return TRUE if all went well, otherwise FALSE
|
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
virtual PRInt32 ResumeParse();
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* Causes the parser to scan foward, collecting nearby (sequential)
|
|
|
|
* attribute tokens into the given node.
|
1998-05-12 04:59:32 +04:00
|
|
|
* @update gess5/11/98
|
1998-06-18 03:13:28 +04:00
|
|
|
* @param node to store attributes
|
|
|
|
* @return number of attributes added to node.
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-06-18 03:13:28 +04:00
|
|
|
virtual PRInt32 CollectAttributes(nsCParserNode& aNode,PRInt32 aCount);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* Causes the next skipped-content token (if any) to
|
|
|
|
* be consumed by this node.
|
1998-05-12 04:59:32 +04:00
|
|
|
* @update gess5/11/98
|
1998-06-18 03:13:28 +04:00
|
|
|
* @param node to consume skipped-content
|
1998-06-23 04:53:50 +04:00
|
|
|
* @param holds the number of skipped content elements encountered
|
|
|
|
* @return Error condition.
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-06-23 04:53:50 +04:00
|
|
|
virtual PRInt32 CollectSkippedContent(nsCParserNode& aNode,PRInt32& aCount);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* This debug routine is used to cause the tokenizer to
|
|
|
|
* iterate its token list, asking each token to dump its
|
|
|
|
* contents to the given output stream.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
* @param
|
|
|
|
* @return
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-06-18 03:13:28 +04:00
|
|
|
void DebugDumpSource(ostream& out);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
|
1998-06-18 03:13:28 +04:00
|
|
|
//*********************************************
|
1998-05-15 02:19:08 +04:00
|
|
|
// These methods are callback methods used by
|
|
|
|
// net lib to let us know about our inputstream.
|
|
|
|
//*********************************************
|
|
|
|
NS_IMETHOD GetBindInfo(void);
|
1998-06-04 09:09:32 +04:00
|
|
|
NS_IMETHOD OnProgress(PRInt32 Progress, PRInt32 ProgressMax, const nsString& aMmsg);
|
|
|
|
NS_IMETHOD OnStartBinding(const char *aContentType);
|
1998-05-15 02:19:08 +04:00
|
|
|
NS_IMETHOD OnDataAvailable(nsIInputStream *pIStream, PRInt32 length);
|
1998-06-04 09:09:32 +04:00
|
|
|
NS_IMETHOD OnStopBinding(PRInt32 status, const nsString& aMsg);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
protected:
|
|
|
|
|
|
|
|
/**
|
1998-05-22 00:38:32 +04:00
|
|
|
*
|
|
|
|
* @update gess5/18/98
|
|
|
|
* @param
|
|
|
|
* @return
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
PRInt32 WillBuildModel(nsString& aFilename);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-05-22 00:38:32 +04:00
|
|
|
*
|
|
|
|
* @update gess5/18/98
|
|
|
|
* @param
|
|
|
|
* @return
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-05-22 00:38:32 +04:00
|
|
|
PRInt32 DidBuildModel(PRInt32 anErrorCode);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-05-22 00:38:32 +04:00
|
|
|
* This method gets called when the tokens have been consumed, and it's time
|
|
|
|
* to build the model via the content sink.
|
1998-05-12 04:59:32 +04:00
|
|
|
* @update gess5/11/98
|
1998-05-22 00:38:32 +04:00
|
|
|
* @return YES if model building went well -- NO otherwise.
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
virtual PRInt32 BuildModel(void);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
1998-06-18 03:13:28 +04:00
|
|
|
private:
|
1998-05-12 04:59:32 +04:00
|
|
|
|
1998-06-18 03:13:28 +04:00
|
|
|
/*******************************************
|
|
|
|
These are the tokenization methods...
|
|
|
|
*******************************************/
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* Cause the tokenizer to consume the next token, and
|
|
|
|
* return an error result.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
* @param anError -- ref to error code
|
|
|
|
* @return new token or null
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-06-18 03:13:28 +04:00
|
|
|
virtual PRInt32 ConsumeToken(CToken*& aToken);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* Part of the code sandwich, this gets called right before
|
|
|
|
* the tokenization process begins. The main reason for
|
|
|
|
* this call is to allow the delegate to do initialization.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
* @param
|
|
|
|
* @return TRUE if it's ok to proceed
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
PRBool WillTokenize();
|
1998-05-12 04:59:32 +04:00
|
|
|
|
1998-07-14 01:13:09 +04:00
|
|
|
|
1998-05-12 04:59:32 +04:00
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* This is the primary control routine. It iteratively
|
|
|
|
* consumes tokens until an error occurs or you run out
|
|
|
|
* of data.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
* @return error code
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
PRInt32 Tokenize();
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* This is the tail-end of the code sandwich for the
|
|
|
|
* tokenization process. It gets called once tokenziation
|
|
|
|
* has completed.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
* @param
|
|
|
|
* @return TRUE if all went well
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-07-14 01:13:09 +04:00
|
|
|
PRBool DidTokenize();
|
1998-05-12 04:59:32 +04:00
|
|
|
|
|
|
|
/**
|
1998-06-18 03:13:28 +04:00
|
|
|
* This debug routine is used to cause the tokenizer to
|
|
|
|
* iterate its token list, asking each token to dump its
|
|
|
|
* contents to the given output stream.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
* @param
|
|
|
|
* @return
|
1998-05-12 04:59:32 +04:00
|
|
|
*/
|
1998-06-18 03:13:28 +04:00
|
|
|
void DebugDumpTokens(ostream& out);
|
1998-05-12 04:59:32 +04:00
|
|
|
|
1998-06-23 04:53:50 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* This method is used as a backstop to compute the kind of content
|
|
|
|
* that is contained in the scanner stream. This method is important
|
|
|
|
* because it allows us to defer the resolution of our DTD (and hence)
|
|
|
|
* filters and maybe eventually sinks based on the input type.
|
|
|
|
*
|
|
|
|
* @update gess6/22/98
|
|
|
|
* @param
|
|
|
|
* @return TRUE if we figured it out.
|
|
|
|
*/
|
1998-07-10 09:35:23 +04:00
|
|
|
eAutoDetectResult AutoDetectContentType(nsString& aBuffer,nsString& aType);
|
1998-06-23 04:53:50 +04:00
|
|
|
|
1998-05-15 02:19:08 +04:00
|
|
|
|
|
|
|
protected:
|
1998-05-12 04:59:32 +04:00
|
|
|
//*********************************************
|
|
|
|
// And now, some data members...
|
|
|
|
//*********************************************
|
|
|
|
|
1998-07-14 01:13:09 +04:00
|
|
|
/*****************************************************
|
|
|
|
All of these moved into the parse-context object:
|
|
|
|
|
|
|
|
PRInt32 mMajorIteration;
|
|
|
|
PRInt32 mMinorIteration;
|
|
|
|
|
|
|
|
nsIURL* mURL;
|
|
|
|
nsString mSourceType;
|
|
|
|
nsString mTargetType;
|
|
|
|
eAutoDetectResult mAutoDetectStatus;
|
1998-05-22 00:38:32 +04:00
|
|
|
|
1998-05-12 04:59:32 +04:00
|
|
|
nsDequeIterator* mCurrentPos;
|
1998-05-22 00:38:32 +04:00
|
|
|
nsDequeIterator* mMarkPos;
|
1998-07-14 01:13:09 +04:00
|
|
|
nsDeque mTokenDeque;
|
|
|
|
CScanner* mScanner;
|
1998-05-12 04:59:32 +04:00
|
|
|
nsIDTD* mDTD;
|
1998-07-14 01:13:09 +04:00
|
|
|
|
1998-05-12 04:59:32 +04:00
|
|
|
eParseMode mParseMode;
|
1998-05-22 00:38:32 +04:00
|
|
|
char* mTransferBuffer;
|
1998-07-14 01:13:09 +04:00
|
|
|
*****************************************************/
|
|
|
|
|
|
|
|
CParserContext* mParserContext;
|
|
|
|
|
|
|
|
/*****************************************************
|
|
|
|
The above fields are moving into parse-context
|
|
|
|
*****************************************************/
|
|
|
|
|
|
|
|
|
|
|
|
nsIStreamObserver* mObserver;
|
|
|
|
nsIContentSink* mSink;
|
|
|
|
nsIParserFilter* mParserFilter;
|
1998-06-18 03:13:28 +04:00
|
|
|
|
|
|
|
|
1998-07-02 12:14:22 +04:00
|
|
|
nsIDTDDebug* mDTDDebug;
|
1998-07-14 01:13:09 +04:00
|
|
|
|
|
|
|
|
1998-04-14 00:24:54 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|