From 99ea5afe50ac5230454e675ccb137aaf2c73f062 Mon Sep 17 00:00:00 2001 From: "kostello%netscape.com" Date: Thu, 4 Mar 1999 21:52:57 +0000 Subject: [PATCH] Added new sink that outputs a Text stream instead of an HTML stream. --- htmlparser/src/MANIFEST | 1 + htmlparser/src/Makefile.in | 2 + htmlparser/src/makefile.win | 3 + htmlparser/src/nsHTMLContentSinkStream.cpp | 163 ++++- htmlparser/src/nsHTMLToTXTSinkStream.cpp | 626 ++++++++++++++++++ htmlparser/src/nsHTMLToTXTSinkStream.h | 132 ++++ parser/htmlparser/src/MANIFEST | 1 + parser/htmlparser/src/Makefile.in | 2 + parser/htmlparser/src/makefile.win | 3 + .../src/nsHTMLContentSinkStream.cpp | 163 ++++- .../htmlparser/src/nsHTMLToTXTSinkStream.cpp | 626 ++++++++++++++++++ parser/htmlparser/src/nsHTMLToTXTSinkStream.h | 132 ++++ 12 files changed, 1818 insertions(+), 36 deletions(-) create mode 100644 htmlparser/src/nsHTMLToTXTSinkStream.cpp create mode 100644 htmlparser/src/nsHTMLToTXTSinkStream.h create mode 100644 parser/htmlparser/src/nsHTMLToTXTSinkStream.cpp create mode 100644 parser/htmlparser/src/nsHTMLToTXTSinkStream.h diff --git a/htmlparser/src/MANIFEST b/htmlparser/src/MANIFEST index 1509c6a29a8..c327b9697e8 100644 --- a/htmlparser/src/MANIFEST +++ b/htmlparser/src/MANIFEST @@ -6,6 +6,7 @@ nshtmlpars.h nsIContentSink.h nsIHTMLContentSink.h nsHTMLContentSinkStream.h +nsHTMLToTXTSinkStream.h nsITokenizer.h nsHTMLTokens.h nsIParserNode.h diff --git a/htmlparser/src/Makefile.in b/htmlparser/src/Makefile.in index 1a41798853e..fed1f858d58 100644 --- a/htmlparser/src/Makefile.in +++ b/htmlparser/src/Makefile.in @@ -50,6 +50,7 @@ CPPSRCS= \ nsToken.cpp \ nsTokenHandler.cpp \ nsHTMLContentSinkStream.cpp \ + nsHTMLToTXTSinkStream.cpp \ nsValidDTD.cpp \ nsWellFormedDTD.cpp \ nsViewSourceHTML.cpp \ @@ -64,6 +65,7 @@ EXPORTS = \ nsIExpatTokenizer.h \ nsIHTMLContentSink.h \ nsHTMLContentSinkStream.h \ + nsHTMLToTXTSinkStream.h \ nsHTMLEntities.h \ nsHTMLTokens.h \ nsILoggingSink.h \ diff --git 
a/htmlparser/src/makefile.win b/htmlparser/src/makefile.win
index 24972b41bdd..0f9ad6896ab 100644
--- a/htmlparser/src/makefile.win
+++ b/htmlparser/src/makefile.win
@@ -46,6 +46,7 @@ CPPSRCS= \
 	nsToken.cpp \
 	nsTokenHandler.cpp \
 	nsHTMLContentSinkStream.cpp \
+	nsHTMLToTXTSinkStream.cpp \
 	nsValidDTD.cpp \
 	nsWellFormedDTD.cpp \
 	nsViewSourceHTML.cpp \
@@ -77,6 +78,7 @@ CPP_OBJS= \
 	.\$(OBJDIR)\nsToken.obj \
 	.\$(OBJDIR)\nsTokenHandler.obj \
 	.\$(OBJDIR)\nsHTMLContentSinkStream.obj \
+	.\$(OBJDIR)\nsHTMLToTXTSinkStream.obj \
 	.\$(OBJDIR)\nsValidDTD.obj \
 	.\$(OBJDIR)\nsWellFormedDTD.obj \
 	.\$(OBJDIR)\nsViewSourceHTML.obj \
@@ -93,6 +95,7 @@ EXPORTS= \
 	nsIHTMLContentSink.h \
 	nsILoggingSink.h \
 	nsHTMLContentSinkStream.h \
+	nsHTMLToTXTSinkStream.h \
 	nsHTMLEntities.h \
 	nsHTMLTokens.h \
 	nsIParserNode.h \
diff --git a/htmlparser/src/nsHTMLContentSinkStream.cpp b/htmlparser/src/nsHTMLContentSinkStream.cpp
index 509bf3caf0e..d8804f8dff1 100644
--- a/htmlparser/src/nsHTMLContentSinkStream.cpp
+++ b/htmlparser/src/nsHTMLContentSinkStream.cpp
@@ -41,28 +41,154 @@ static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID);
 static char* gHeaderComment = "";
 static char* gDocTypeHeader = "";
 const int gTabSize=2;
-static char gBuffer[500];
-
-
+static char gBuffer[1024];
 /** PRETTY PRINTING PROTOTYPES **/
+// Per-tag formatting record: four break flags, an indent group and a format switch.
+class nsTagFormat
+{
+public:
+  void Init(PRBool aBefore, PRBool aStart, PRBool aEnd, PRBool aAfter);
+  void SetIndentGroup(PRUint8 aGroup);
+  void SetFormat(PRBool aOnOff);
+
+public:
+  PRBool mBreakBefore;
+  PRBool mBreakStart;
+  PRBool mBreakEnd;
+  PRBool mBreakAfter;
+
+  PRUint8 mIndentGroup; // zero for none
+  PRBool mFormat; // format (on|off)
+};
+
+void nsTagFormat::Init(PRBool aBefore, PRBool aStart, PRBool aEnd, PRBool aAfter)
+{
+  mBreakBefore = aBefore;
+  mBreakStart = aStart;
+  mBreakEnd = aEnd;
+  mBreakAfter = aAfter;
+  mIndentGroup = 0; // "none"; without this the field holds garbage until SetIndentGroup()
+  mFormat = PR_TRUE; // formatting is on by default
+}
+
+void nsTagFormat::SetIndentGroup(PRUint8 aGroup)
+{
+  mIndentGroup = aGroup;
+}
+
+void nsTagFormat::SetFormat(PRBool aOnOff)
+{
+  mFormat = aOnOff;
+}
+// Pretty-printer settings plus one nsTagFormat slot for each index 0..NS_HTML_TAG_MAX.
+class nsPrettyPrinter
+{
+public:
+
+  void Init(PRBool aIndentEnable = PR_TRUE, PRUint8 aColSize = 2, PRUint8 aTabSize = 8, PRBool aUseTabs = PR_FALSE );
+
+  PRBool mIndentEnable;
+  PRUint8 mIndentColSize;
+  PRUint8 mIndentTabSize;
+  PRBool mIndentUseTabs;
+
+  PRBool mAutowrapEnable;
+  PRUint32 mAutoWrapColWidth;
+  nsString mBreak; // CRLF, CR, LF
+
+  nsTagFormat mTagFormat[NS_HTML_TAG_MAX+1];
+};
+
+// Loads the defaults: every slot is cleared first, then the tags listed below get their break flags.
+void nsPrettyPrinter::Init(PRBool aIndentEnable, PRUint8 aColSize, PRUint8 aTabSize, PRBool aUseTabs)
+{
+  mIndentEnable = aIndentEnable;
+  mIndentColSize = aColSize;
+  mIndentTabSize = aTabSize;
+  mIndentUseTabs = aUseTabs;
+
+  mAutowrapEnable = PR_TRUE;
+  mAutoWrapColWidth = 72;
+  mBreak = "\n"; // CRLF, CR, LF
+
+  for (PRUint32 i = 0; i <= NS_HTML_TAG_MAX; i++) // inclusive: mTagFormat has NS_HTML_TAG_MAX+1 slots
+    mTagFormat[i].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_FALSE);
+
+  mTagFormat[eHTMLTag_a].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_abbr].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_FALSE);
+  mTagFormat[eHTMLTag_applet].Init(PR_FALSE,PR_TRUE,PR_TRUE,PR_FALSE);
+  mTagFormat[eHTMLTag_area].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_b].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_FALSE);
+  mTagFormat[eHTMLTag_base].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_blockquote].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_body].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_br].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_caption].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_center].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_dd].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_dir].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_div].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_dl].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_dt].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_embed].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_form].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_frame].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_frameset].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_h1].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_h2].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_h3].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_h4].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_h5].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_h6].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_head].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_hr].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_html].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_ilayer].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_input].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_isindex].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_layer].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_li].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_link].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_map].Init(PR_FALSE,PR_TRUE,PR_TRUE,PR_FALSE);
+  mTagFormat[eHTMLTag_menu].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_meta].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_object].Init(PR_FALSE,PR_TRUE,PR_TRUE,PR_FALSE);
+  mTagFormat[eHTMLTag_ol].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_option].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_p].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_param].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_pre].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_script].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_select].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_style].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_table].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+  mTagFormat[eHTMLTag_td].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_textarea].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_th].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_title].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_tr].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE);
+  mTagFormat[eHTMLTag_ul].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE);
+}
+
+
-PRBool IsInline(eHTMLTags aTag);
-PRBool IsBlockLevel(eHTMLTags aTag);
-PRInt32 BreakBeforeOpen(eHTMLTags aTag);
-PRInt32 BreakAfterOpen(eHTMLTags aTag);
-PRInt32 BreakBeforeClose(eHTMLTags aTag);
-PRInt32 BreakAfterClose(eHTMLTags aTag);
-PRBool IndentChildren(eHTMLTags aTag);
-PRBool PreformattedChildren(eHTMLTags aTag);
-PRBool EatOpen(eHTMLTags aTag);
-PRBool EatClose(eHTMLTags aTag);
-PRBool PermitWSBeforeOpen(eHTMLTags aTag);
-PRBool PermitWSAfterOpen(eHTMLTags aTag);
-PRBool PermitWSBeforeClose(eHTMLTags aTag);
-PRBool PermitWSAfterClose(eHTMLTags aTag);
-PRBool IgnoreWS(eHTMLTags aTag);
+static PRBool IsInline(eHTMLTags aTag);
+static PRBool IsBlockLevel(eHTMLTags aTag);
+static PRInt32 BreakBeforeOpen(eHTMLTags aTag);
+static PRInt32 BreakAfterOpen(eHTMLTags aTag);
+static PRInt32 BreakBeforeClose(eHTMLTags aTag);
+static PRInt32 BreakAfterClose(eHTMLTags aTag);
+static PRBool IndentChildren(eHTMLTags aTag);
+static PRBool PreformattedChildren(eHTMLTags aTag);
+static PRBool EatOpen(eHTMLTags aTag);
+static PRBool EatClose(eHTMLTags aTag);
+static PRBool PermitWSBeforeOpen(eHTMLTags aTag);
+static PRBool PermitWSAfterOpen(eHTMLTags aTag);
+static PRBool PermitWSBeforeClose(eHTMLTags aTag);
+static PRBool PermitWSAfterClose(eHTMLTags aTag);
+static PRBool IgnoreWS(eHTMLTags aTag);
@@ -150,6 +276,7 @@
nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead * @return */ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoFormat,PRBool aDoHeader) { + NS_INIT_REFCNT(); mOutput = &aStream; mLowerCaseTags = PR_TRUE; memset(mHTMLTagStack,0,sizeof(mHTMLTagStack)); diff --git a/htmlparser/src/nsHTMLToTXTSinkStream.cpp b/htmlparser/src/nsHTMLToTXTSinkStream.cpp new file mode 100644 index 00000000000..e730491ff5c --- /dev/null +++ b/htmlparser/src/nsHTMLToTXTSinkStream.cpp @@ -0,0 +1,626 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public License + * Version 1.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is Netscape Communications + * Corporation. Portions created by Netscape are Copyright (C) 1998 + * Netscape Communications Corporation. All Rights Reserved. + */ + +/** + * MODULE NOTES: + * + * This file declares the concrete TXT ContentSink class. + * This class is used during the parsing process as the + * primary interface between the parser and the content + * model. 
+ */
+
+
+#include "nsHTMLToTXTSinkStream.h"
+#include "nsHTMLTokens.h"
+#include <iostream.h>
+#include "nsString.h"
+#include "nsIParser.h"
+
+static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
+static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID);
+static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID);
+
+const int gTabSize=2;
+static char gBuffer[1024];
+
+static PRBool IsInline(eHTMLTags aTag);
+static PRBool IsBlockLevel(eHTMLTags aTag);
+
+
+/**
+ * nsISupports::QueryInterface for the text sink. Answers requests for
+ * nsISupports, nsIContentSink and nsIHTMLContentSink, and AddRefs the
+ * object on success.
+ *
+ * @update gpk02/03/99
+ * @param aIID id of the interface being requested
+ * @param aInstancePtr receives the interface pointer; set to 0 when aIID is not supported
+ * @return NS_OK, NS_ERROR_NULL_POINTER (aInstancePtr is null) or NS_NOINTERFACE
+ */
+nsresult
+nsHTMLToTXTSinkStream::QueryInterface(const nsIID& aIID, void** aInstancePtr)
+{
+  if (NULL == aInstancePtr) {
+    return NS_ERROR_NULL_POINTER;
+  }
+  if(aIID.Equals(kISupportsIID)) {
+    *aInstancePtr = (nsIContentSink*)(this);
+  }
+  else if(aIID.Equals(kIContentSinkIID)) {
+    *aInstancePtr = (nsIContentSink*)(this);
+  }
+  else if(aIID.Equals(kIHTMLContentSinkIID)) {
+    *aInstancePtr = (nsIHTMLContentSink*)(this);
+  }
+  else {
+    *aInstancePtr=0;
+    return NS_NOINTERFACE;
+  }
+  NS_ADDREF_THIS();
+  return NS_OK;
+}
+
+
+NS_IMPL_ADDREF(nsHTMLToTXTSinkStream)
+NS_IMPL_RELEASE(nsHTMLToTXTSinkStream)
+
+
+/**
+ * Factory for the HTML-to-text sink: creates an nsHTMLToTXTSinkStream
+ * (writing to cout) and hands it back as an nsIHTMLContentSink.
+ *
+ * @update gpk02/03/99
+ * @param aInstancePtrResult receives the new, AddRef'd sink
+ * @return NS_OK, NS_ERROR_OUT_OF_MEMORY, or the QueryInterface error (the sink is then freed)
+ */
+NS_HTMLPARS nsresult
+NS_New_HTMLToTXT_SinkStream(nsIHTMLContentSink** aInstancePtrResult) {
+  nsHTMLToTXTSinkStream* it = new nsHTMLToTXTSinkStream();
+  if (nsnull == it) {
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+  nsresult result = it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult);
+  if (NS_OK != result) {
+    delete it; // QueryInterface took no reference, so nothing else would ever free the sink
+  }
+  return result;
+}
+
+/**
+ * Construct a content sink stream.
+ * @update gpk02/03/99
+ * @param
+ * @return
+ */
+nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() {
+  NS_INIT_REFCNT();
+  mOutput=&cout;
+  mColPos = 0;
+  mIndent = 0;
+  mDoOutput = PR_FALSE;
+}
+
+/**
+ * Construct a content sink stream that writes to aStream.
+ * @update gpk02/03/99
+ * @param aStream destination stream; only its address is kept, the sink does not own it
+ * Output stays disabled (mDoOutput is PR_FALSE) until OpenContainer() sees BODY.
+ */
+nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) {
+  NS_INIT_REFCNT();
+  mOutput = &aStream;
+  mColPos = 0;
+  mIndent = 0;
+  mDoOutput = PR_FALSE;
+}
+
+
+/**
+ * Destructor. Only clears the stream pointer.
+ * @update gpk02/03/99
+ * The stream itself is left alone because the sink
+ * never owned it.
+ */
+nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() {
+  mOutput=0; //we don't own the stream we're given; just forget it.
+}
+
+
+/**
+ * Redirect all further output to aStream.
+ * @update gpk02/03/99
+ * @param aStream new destination stream; not owned by the sink
+ * @return nothing
+ */
+NS_IMETHODIMP_(void)
+nsHTMLToTXTSinkStream::SetOutputStream(ostream& aStream){
+  mOutput=&aStream;
+}
+
+
+
+/**
+ * Empty placeholder: writes nothing to aStream.
+ * @update gpk02/03/99
+ * @param theTag, aNode, tab, aStream, aNewline all unused
+ * @return nothing
+ */
+static
+void OpenTagWithAttributes(const char* theTag,const nsIParserNode& aNode,int tab,ostream& aStream,PRBool aNewline) {
+}
+
+
+/**
+ * Empty placeholder: writes nothing to aStream.
+ * @update gpk02/03/99
+ * @param theTag, tab, aStream, aNewline all unused
+ * @return nothing
+ */
+static
+void OpenTag(const char* theTag,int tab,ostream& aStream,PRBool aNewline) {
+}
+
+
+/**
+ * Empty placeholder: writes nothing to aStream.
+ * @update gpk02/03/99
+ * @param theTag, tab, aStream all unused
+ * @return nothing
+ */
+static
+void CloseTag(const char* theTag,int tab,ostream& aStream) {
+}
+
+
+/**
+ * Write theContent to aStream between the (currently empty) OpenTag()/CloseTag() calls.
+ * @update gpk02/03/99
+ * @param aTag tag whose name is looked up with GetTagName(); theContent is the text to write
+ * @return nothing. NOTE(review): the text goes through the 1024-byte gBuffer - presumably cut off if longer; confirm
+ */
+static
+void WritePair(eHTMLTags aTag,const nsString& theContent,int tab,ostream& aStream) {
+  const char* titleStr = GetTagName(aTag);
+  OpenTag(titleStr,tab,aStream,PR_FALSE);
+  theContent.ToCString(gBuffer,sizeof(gBuffer)-1);
+  aStream << gBuffer;
+  CloseTag(titleStr,0,aStream);
+}
+
+/**
+ * This method gets called by the parser when it encounters
+ * a title tag and wants to set the document title in the sink.
+ *
+ * @update gpk02/03/99
+ * @param nsString reference to new title value
+ * @return PR_TRUE if successful.
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::SetTitle(const nsString& aValue){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the outer HTML container is opened; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenHTML(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the outer HTML container is closed; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseHTML(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the HEAD container is opened; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenHead(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the HEAD container is closed; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseHead(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the main BODY container is opened; nothing happens here.
+ * NOTE(review): output is only switched on by OpenContainer(eHTMLTag_body) - confirm the parser reports BODY there.
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenBody(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the main BODY container is closed; nothing happens here.
+ * NOTE(review): output is only switched off by CloseContainer(eHTMLTag_body) - confirm the parser reports BODY there.
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseBody(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when a FORM container is opened; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenForm(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when a FORM container is closed; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseForm(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+/**
+ * Called when a MAP container is opened; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenMap(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when a MAP container is closed; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseMap(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the FRAMESET container is opened; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenFrameset(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * Called when the FRAMESET container is closed; the text sink ignores it.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseFrameset(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+
+
+
+
+
+/**
+ * This gets called by the parser when you want to add
+ * a leaf node to the current container in the content
+ * model.
+ * Only text and whitespace leaves produce output, and only while
+ * output is enabled (between the opening and closing of BODY).
+ *
+ * @updated gpk 06/18/98
+ * @param aNode leaf node handed over by the parser
+ * @param aStream stream that receives the text
+ * @return NS_OK always
+ */
+nsresult
+nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream)
+{
+  eHTMLTags type = (eHTMLTags)aNode.GetNodeType();
+
+  // Nothing is written until OpenContainer() has seen BODY.
+  if (mDoOutput == PR_FALSE)
+    return NS_OK;
+
+  // Text and whitespace leaves are written in exactly the same way, so
+  // they share one branch. Every other leaf type is ignored.
+  if (type == eHTMLTag_text || type == eHTMLTag_whitespace) {
+    const nsString& text = aNode.GetText();
+
+    // NOTE(review): gBuffer is 1024 bytes, so longer text is presumably
+    // truncated here while mColPos still grows by the full length -
+    // confirm against nsString::ToCString().
+    text.ToCString(gBuffer,sizeof(gBuffer)-1);
+    aStream << gBuffer;
+    mColPos += text.Length();
+  }
+
+  return NS_OK;
+}
+
+
+
+/**
+ * This gets called by the parser when you want to add
+ * a PI node to the current container in the content
+ * model. The text sink ignores processing instructions.
+ *
+ * @updated gpk02/03/99
+ * @param aNode parser node for the processing instruction (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::AddProcessingInstruction(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+/**
+ * This gets called by the parser when you want to add
+ * a comment node to the current container in the content
+ * model. The text sink ignores comments.
+ *
+ * @updated gpk02/03/99
+ * @param aNode parser node for the comment (unused)
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::AddComment(const nsIParserNode& aNode){
+  return NS_OK;
+}
+
+
+/**
+ * This method is used to open a general container.
+ * This includes: OL,UL,DIR,SPAN,TABLE,H[1..6],etc.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container being opened
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){
+  eHTMLTags type = (eHTMLTags)aNode.GetNodeType();
+
+  // Opening BODY switches output on; CloseContainer() switches it off again.
+  if (type == eHTMLTag_body)
+    mDoOutput = PR_TRUE;
+  return NS_OK;
+}
+
+
+/**
+ * This method is used to close a generic container.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode parser node for the container being closed
+ * @return NS_OK always
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::CloseContainer(const nsIParserNode& aNode){
+  eHTMLTags type = (eHTMLTags)aNode.GetNodeType();
+
+  if (type == eHTMLTag_body)
+    mDoOutput = PR_FALSE;
+
+  // Closing a block-level container ends the current output line, unless
+  // nothing has been written on it yet (mColPos == 0). IsBlockLevel() is
+  // true for every tag IsInline() does not list. mOutput is tested first,
+  // as AddLeaf() does, so a missing stream is never dereferenced.
+  if (mOutput && IsBlockLevel(type) && (mColPos != 0))
+  {
+    *mOutput << endl;
+    mColPos = 0;
+  }
+  return NS_OK;
+}
+
+
+/**
+ * This method is used to add a leaf to the currently
+ * open container. It forwards to the two-argument AddLeaf() with mOutput.
+ *
+ * @update 07/12/98 gpk
+ * @param aNode leaf node handed over by the parser
+ * @return result of the two-argument AddLeaf(), or NS_OK when there is no stream
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode){
+  nsresult result = NS_OK;
+  if(mOutput) {
+    result = AddLeaf(aNode,*mOutput);
+  }
+  return result;
+}
+
+
+/**
+ * This method gets called when the parser begins the process
+ * of building the content model via the content sink.
+ *
+ * @update gpk02/03/99
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::WillBuildModel(void){
+  return NS_OK;
+}
+
+
+/**
+ * This method gets called when the parser concludes the process
+ * of building the content model via the content sink.
+ *
+ * @param aQualityLevel describes how well formed the doc was.
+ * 0=GOOD; 1=FAIR; 2=POOR;
+ * @update gpk02/03/99
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::DidBuildModel(PRInt32 aQualityLevel) {
+  return NS_OK;
+}
+
+
+/**
+ * This method gets called when the parser gets i/o blocked,
+ * and wants to notify the sink that it may be a while before
+ * more data is available.
+ *
+ * @update gpk02/03/99
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::WillInterrupt(void) {
+  return NS_OK;
+}
+
+
+/**
+ * This method gets called when the parser i/o gets unblocked,
+ * and we're about to start dumping content again to the sink.
+ *
+ * @update gpk02/03/99
+ */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::WillResume(void) {
+  return NS_OK;
+}
+/** The parser pointer is ignored; the sink keeps no reference to it. Returns NS_OK. */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::SetParser(nsIParser* aParser) {
+  return NS_OK;
+}
+/** Parser errors are ignored by the text sink. Returns NS_OK. */
+NS_IMETHODIMP
+nsHTMLToTXTSinkStream::NotifyError(nsresult aErrorResult)
+{
+  return NS_OK;
+}
+
+/** Returns PR_TRUE for the tags listed in the switch below and PR_FALSE for every other tag. */
+PRBool IsInline(eHTMLTags aTag)
+{
+  PRBool result = PR_FALSE;
+
+  switch (aTag)
+  {
+    case eHTMLTag_a:
+    case eHTMLTag_address:
+    case eHTMLTag_big:
+    case eHTMLTag_blink:
+    case eHTMLTag_b:
+    case eHTMLTag_br:
+    case eHTMLTag_cite:
+    case eHTMLTag_code:
+    case eHTMLTag_dfn:
+    case eHTMLTag_em:
+    case eHTMLTag_font:
+    case eHTMLTag_img:
+    case eHTMLTag_i:
+    case eHTMLTag_kbd:
+    case eHTMLTag_keygen:
+    case eHTMLTag_nobr:
+    case eHTMLTag_samp:
+    case eHTMLTag_small:
+    case eHTMLTag_spacer:
+    case eHTMLTag_span:
+    case eHTMLTag_strike:
+    case eHTMLTag_strong:
+    case eHTMLTag_sub:
+    case eHTMLTag_sup:
+    case eHTMLTag_td:
+    case eHTMLTag_textarea:
+    case eHTMLTag_tt:
+    case eHTMLTag_var:
+    case eHTMLTag_wbr:
+
+      result = PR_TRUE;
+      break;
+
+    default:
+      break;
+
+  }
+  return result;
+}
+/** Plain negation of IsInline(): any tag not listed there counts as block level. */
+PRBool IsBlockLevel(eHTMLTags aTag)
+{
+  return !IsInline(aTag);
+}
+
diff --git a/htmlparser/src/nsHTMLToTXTSinkStream.h b/htmlparser/src/nsHTMLToTXTSinkStream.h
new file mode 100644
index 00000000000..743537c5df4
--- /dev/null
+++ b/htmlparser/src/nsHTMLToTXTSinkStream.h
@@ -0,0 +1,132 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/*
+ * The contents of this file are subject to the Netscape Public License
+ * Version 1.0 (the "NPL"); you may not use this file except in
+ * compliance with the NPL. You may obtain a copy of the NPL at
+ * http://www.mozilla.org/NPL/
+ *
+ * Software distributed under the NPL is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
+ * for the specific language governing rights and limitations under the
+ * NPL.
+ *
+ * The Initial Developer of this code under the NPL is Netscape
+ * Communications Corporation. Portions created by Netscape are
+ * Copyright (C) 1998 Netscape Communications Corporation. All Rights
+ * Reserved.
+ */
+
+/**
+ * MODULE NOTES:
+ *
+ * If you've been paying attention to our many content sink classes, you may be
+ * asking yourself, "why do we need yet another one?" The answer is that this
+ * implementation, unlike the others, sends its output to a given stream
+ * rather than to an actual content sink (as defined in our HTML document system).
+ *
+ * What the implementation does today:
+ * 1) Writes text and whitespace leaves found between the opening and closing of BODY
+ * 2) Ends the current line when a block-level container closes
+ * 3) Ignores everything else (tags, attributes, comments, processing instructions)
+ *
+ * If no stream is declared in the constructor then all output goes to cout.
+ * NOTE(review): these notes used to mention XIF i/o and pretty printing; neither
+ * appears in nsHTMLToTXTSinkStream.cpp - presumably carried over from the HTML stream sink.
+ */
+
+#ifndef NS_HTMLTOTEXTSINK_STREAM
+#define NS_HTMLTOTEXTSINK_STREAM
+
+#include "nsIParserNode.h"
+#include "nsIHTMLContentSink.h"
+#include "nshtmlpars.h"
+#include "nsHTMLTokens.h"
+
+
+#define NS_HTMLTOTEXTSINK_STREAM_IID \
+  {0xa39c6bff, 0x15f0, 0x11d2, \
+  {0x80, 0x41, 0x0, 0x10, 0x4b, 0x98, 0x3f, 0xd4}}
+
+#ifndef XP_MAC
+class ostream;
+#endif
+
+class nsHTMLToTXTSinkStream : public nsIHTMLContentSink {
+  public:
+
+  /**
+   * Constructors: the default one writes to cout, the other to aStream (not owned).
+   * @update gpk02/03/99
+   */
+  nsHTMLToTXTSinkStream();
+  nsHTMLToTXTSinkStream(ostream& aStream);
+
+  /**
+   * virtual destructor; forgets the stream without deleting it
+   * @update gpk02/03/99
+   */
+  virtual ~nsHTMLToTXTSinkStream();
+
+  NS_IMETHOD_(void) SetOutputStream(ostream& aStream);
+
+  // nsISupports
+  NS_DECL_ISUPPORTS
+
+  /*******************************************************************
+   * The following methods are inherited from nsIContentSink.
+   * Please see that file for details.
+   *******************************************************************/
+  NS_IMETHOD WillBuildModel(void);
+  NS_IMETHOD DidBuildModel(PRInt32 aQualityLevel);
+  NS_IMETHOD WillInterrupt(void);
+  NS_IMETHOD WillResume(void);
+  NS_IMETHOD SetParser(nsIParser* aParser);
+  NS_IMETHOD OpenContainer(const nsIParserNode& aNode);
+  NS_IMETHOD CloseContainer(const nsIParserNode& aNode);
+  NS_IMETHOD AddLeaf(const nsIParserNode& aNode);
+  NS_IMETHOD NotifyError(nsresult aErrorResult);
+  NS_IMETHOD AddComment(const nsIParserNode& aNode);
+  NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
+
+  /*******************************************************************
+   * The following methods are inherited from nsIHTMLContentSink.
+   * Please see that file for details.
+   *******************************************************************/
+  NS_IMETHOD SetTitle(const nsString& aValue);
+  NS_IMETHOD OpenHTML(const nsIParserNode& aNode);
+  NS_IMETHOD CloseHTML(const nsIParserNode& aNode);
+  NS_IMETHOD OpenHead(const nsIParserNode& aNode);
+  NS_IMETHOD CloseHead(const nsIParserNode& aNode);
+  NS_IMETHOD OpenBody(const nsIParserNode& aNode);
+  NS_IMETHOD CloseBody(const nsIParserNode& aNode);
+  NS_IMETHOD OpenForm(const nsIParserNode& aNode);
+  NS_IMETHOD CloseForm(const nsIParserNode& aNode);
+  NS_IMETHOD OpenMap(const nsIParserNode& aNode);
+  NS_IMETHOD CloseMap(const nsIParserNode& aNode);
+  NS_IMETHOD OpenFrameset(const nsIParserNode& aNode);
+  NS_IMETHOD CloseFrameset(const nsIParserNode& aNode);
+
+
+protected:
+
+  nsresult AddLeaf(const nsIParserNode& aNode, ostream& aStream); // writes text/whitespace leaves to aStream
+  void WriteAttributes(const nsIParserNode& aNode,ostream& aStream); // NOTE(review): no definition in nsHTMLToTXTSinkStream.cpp
+
+
+protected:
+  ostream* mOutput; // destination stream; not owned
+  PRInt32 mIndent; // set to 0 by the constructors; not read anywhere in the .cpp
+  PRInt32 mColPos; // characters written on the current output line; 0 at line start
+  PRBool mDoOutput; // PR_TRUE only between OpenContainer(body) and CloseContainer(body)
+
+};
+
+extern NS_HTMLPARS nsresult
+NS_New_HTMLToTXT_SinkStream(nsIHTMLContentSink** aInstancePtrResult);
+
+
+#endif
+
+
+
+
diff --git a/parser/htmlparser/src/MANIFEST b/parser/htmlparser/src/MANIFEST
index 1509c6a29a8..c327b9697e8
100644 --- a/parser/htmlparser/src/MANIFEST +++ b/parser/htmlparser/src/MANIFEST @@ -6,6 +6,7 @@ nshtmlpars.h nsIContentSink.h nsIHTMLContentSink.h nsHTMLContentSinkStream.h +nsHTMLToTXTSinkStream.h nsITokenizer.h nsHTMLTokens.h nsIParserNode.h diff --git a/parser/htmlparser/src/Makefile.in b/parser/htmlparser/src/Makefile.in index 1a41798853e..fed1f858d58 100644 --- a/parser/htmlparser/src/Makefile.in +++ b/parser/htmlparser/src/Makefile.in @@ -50,6 +50,7 @@ CPPSRCS= \ nsToken.cpp \ nsTokenHandler.cpp \ nsHTMLContentSinkStream.cpp \ + nsHTMLToTXTSinkStream.cpp \ nsValidDTD.cpp \ nsWellFormedDTD.cpp \ nsViewSourceHTML.cpp \ @@ -64,6 +65,7 @@ EXPORTS = \ nsIExpatTokenizer.h \ nsIHTMLContentSink.h \ nsHTMLContentSinkStream.h \ + nsHTMLToTXTSinkStream.h \ nsHTMLEntities.h \ nsHTMLTokens.h \ nsILoggingSink.h \ diff --git a/parser/htmlparser/src/makefile.win b/parser/htmlparser/src/makefile.win index 24972b41bdd..0f9ad6896ab 100644 --- a/parser/htmlparser/src/makefile.win +++ b/parser/htmlparser/src/makefile.win @@ -46,6 +46,7 @@ CPPSRCS= \ nsToken.cpp \ nsTokenHandler.cpp \ nsHTMLContentSinkStream.cpp \ + nsHTMLToTXTSinkStream.cpp \ nsValidDTD.cpp \ nsWellFormedDTD.cpp \ nsViewSourceHTML.cpp \ @@ -77,6 +78,7 @@ CPP_OBJS= \ .\$(OBJDIR)\nsToken.obj \ .\$(OBJDIR)\nsTokenHandler.obj \ .\$(OBJDIR)\nsHTMLContentSinkStream.obj \ + .\$(OBJDIR)\nsHTMLToTXTSinkStream.obj \ .\$(OBJDIR)\nsValidDTD.obj \ .\$(OBJDIR)\nsWellFormedDTD.obj \ .\$(OBJDIR)\nsViewSourceHTML.obj \ @@ -93,6 +95,7 @@ EXPORTS= \ nsIHTMLContentSink.h \ nsILoggingSink.h \ nsHTMLContentSinkStream.h \ + nsHTMLToTXTSinkStream.h \ nsHTMLEntities.h \ nsHTMLTokens.h \ nsIParserNode.h \ diff --git a/parser/htmlparser/src/nsHTMLContentSinkStream.cpp b/parser/htmlparser/src/nsHTMLContentSinkStream.cpp index 509bf3caf0e..d8804f8dff1 100644 --- a/parser/htmlparser/src/nsHTMLContentSinkStream.cpp +++ b/parser/htmlparser/src/nsHTMLContentSinkStream.cpp @@ -41,28 +41,154 @@ static NS_DEFINE_IID(kIHTMLContentSinkIID, 
NS_IHTML_CONTENT_SINK_IID); static char* gHeaderComment = ""; static char* gDocTypeHeader = ""; const int gTabSize=2; -static char gBuffer[500]; - - +static char gBuffer[1024]; /** PRETTY PRINTING PROTOTYPES **/ + +class nsTagFormat +{ +public: + void Init(PRBool aBefore, PRBool aStart, PRBool aEnd, PRBool aAfter); + void SetIndentGroup(PRUint8 aGroup); + void SetFormat(PRBool aOnOff); + +public: + PRBool mBreakBefore; + PRBool mBreakStart; + PRBool mBreakEnd; + PRBool mBreakAfter; + + PRUint8 mIndentGroup; // zero for none + PRBool mFormat; // format (on|off) +}; + +void nsTagFormat::Init(PRBool aBefore, PRBool aStart, PRBool aEnd, PRBool aAfter) +{ + mBreakBefore = aBefore; + mBreakStart = aStart; + mBreakEnd = aEnd; + mBreakAfter = aAfter; + mFormat = PR_TRUE; +} + +void nsTagFormat::SetIndentGroup(PRUint8 aGroup) +{ + mIndentGroup = aGroup; +} + +void nsTagFormat::SetFormat(PRBool aOnOff) +{ + mFormat = aOnOff; +} + +class nsPrettyPrinter +{ +public: + + void Init(PRBool aIndentEnable = PR_TRUE, PRUint8 aColSize = 2, PRUint8 aTabSize = 8, PRBool aUseTabs = PR_FALSE ); + + PRBool mIndentEnable; + PRUint8 mIndentColSize; + PRUint8 mIndentTabSize; + PRBool mIndentUseTabs; + + PRBool mAutowrapEnable; + PRUint32 mAutoWrapColWidth; + nsString mBreak; // CRLF, CR, LF + + nsTagFormat mTagFormat[NS_HTML_TAG_MAX+1]; +}; + + +void nsPrettyPrinter::Init(PRBool aIndentEnable, PRUint8 aColSize, PRUint8 aTabSize, PRBool aUseTabs) +{ + mIndentEnable = aIndentEnable; + mIndentColSize = aColSize; + mIndentTabSize = aTabSize; + mIndentUseTabs = aUseTabs; + + mAutowrapEnable = PR_TRUE; + mAutoWrapColWidth = 72; + mBreak = "\n"; // CRLF, CR, LF + + for (PRUint32 i = 0; i < NS_HTML_TAG_MAX; i++) + mTagFormat[i].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_FALSE); + + mTagFormat[eHTMLTag_a].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_abbr].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_FALSE); + mTagFormat[eHTMLTag_applet].Init(PR_FALSE,PR_TRUE,PR_TRUE,PR_FALSE); + 
mTagFormat[eHTMLTag_area].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_b].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_FALSE); + mTagFormat[eHTMLTag_base].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_blockquote].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_body].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_br].Init(PR_FALSE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_caption].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_center].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_dd].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_dir].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_div].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_dl].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_dt].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_embed].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_form].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_frame].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_frameset].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_h1].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_h2].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_h3].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_h4].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_h5].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_h6].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_head].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_hr].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_html].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_ilayer].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_input].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_isindex].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + 
mTagFormat[eHTMLTag_layer].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_li].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_link].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_map].Init(PR_FALSE,PR_TRUE,PR_TRUE,PR_FALSE); + mTagFormat[eHTMLTag_menu].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_meta].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_object].Init(PR_FALSE,PR_TRUE,PR_TRUE,PR_FALSE); + mTagFormat[eHTMLTag_ol].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_option].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_p].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_param].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_pre].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_script].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_select].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_style].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_table].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); + mTagFormat[eHTMLTag_td].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_textarea].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_th].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_title].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_tr].Init(PR_TRUE,PR_FALSE,PR_FALSE,PR_TRUE); + mTagFormat[eHTMLTag_ul].Init(PR_TRUE,PR_TRUE,PR_TRUE,PR_TRUE); +} + + + -PRBool IsInline(eHTMLTags aTag); -PRBool IsBlockLevel(eHTMLTags aTag); -PRInt32 BreakBeforeOpen(eHTMLTags aTag); -PRInt32 BreakAfterOpen(eHTMLTags aTag); -PRInt32 BreakBeforeClose(eHTMLTags aTag); -PRInt32 BreakAfterClose(eHTMLTags aTag); -PRBool IndentChildren(eHTMLTags aTag); -PRBool PreformattedChildren(eHTMLTags aTag); -PRBool EatOpen(eHTMLTags aTag); -PRBool EatClose(eHTMLTags aTag); -PRBool PermitWSBeforeOpen(eHTMLTags aTag); -PRBool PermitWSAfterOpen(eHTMLTags aTag); 
-PRBool PermitWSBeforeClose(eHTMLTags aTag); -PRBool PermitWSAfterClose(eHTMLTags aTag); -PRBool IgnoreWS(eHTMLTags aTag); +static PRBool IsInline(eHTMLTags aTag); +static PRBool IsBlockLevel(eHTMLTags aTag); +static PRInt32 BreakBeforeOpen(eHTMLTags aTag); +static PRInt32 BreakAfterOpen(eHTMLTags aTag); +static PRInt32 BreakBeforeClose(eHTMLTags aTag); +static PRInt32 BreakAfterClose(eHTMLTags aTag); +static PRBool IndentChildren(eHTMLTags aTag); +static PRBool PreformattedChildren(eHTMLTags aTag); +static PRBool EatOpen(eHTMLTags aTag); +static PRBool EatClose(eHTMLTags aTag); +static PRBool PermitWSBeforeOpen(eHTMLTags aTag); +static PRBool PermitWSAfterOpen(eHTMLTags aTag); +static PRBool PermitWSBeforeClose(eHTMLTags aTag); +static PRBool PermitWSAfterClose(eHTMLTags aTag); +static PRBool IgnoreWS(eHTMLTags aTag); @@ -150,6 +276,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead * @return */ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoFormat,PRBool aDoHeader) { + NS_INIT_REFCNT(); mOutput = &aStream; mLowerCaseTags = PR_TRUE; memset(mHTMLTagStack,0,sizeof(mHTMLTagStack)); diff --git a/parser/htmlparser/src/nsHTMLToTXTSinkStream.cpp b/parser/htmlparser/src/nsHTMLToTXTSinkStream.cpp new file mode 100644 index 00000000000..e730491ff5c --- /dev/null +++ b/parser/htmlparser/src/nsHTMLToTXTSinkStream.cpp @@ -0,0 +1,626 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public License + * Version 1.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. 
+ * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is Netscape Communications + * Corporation. Portions created by Netscape are Copyright (C) 1998 + * Netscape Communications Corporation. All Rights Reserved. + */ + +/** + * MODULE NOTES: + * + * This file implements the concrete TXT ContentSink class. + * This class is used during the parsing process as the + * primary interface between the parser and the content + * model. + */ + + +#include "nsHTMLToTXTSinkStream.h" +#include "nsHTMLTokens.h" +#include +#include "nsString.h" +#include "nsIParser.h" + +static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID); +static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID); +static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID); + +const int gTabSize=2; +static char gBuffer[1024]; + +static PRBool IsInline(eHTMLTags aTag); +static PRBool IsBlockLevel(eHTMLTags aTag); + + + + + +/** + * This method gets called as part of our COM-like interfaces. + * Its purpose is to create an interface to parser object + * of some type. + * + * @update gpk02/03/99 + * @param nsIID id of object to discover + * @param aInstancePtr ptr to newly discovered interface + * @return NS_xxx result code + */ +nsresult +nsHTMLToTXTSinkStream::QueryInterface(const nsIID& aIID, void** aInstancePtr) +{ + if (NULL == aInstancePtr) { + return NS_ERROR_NULL_POINTER; + } + if(aIID.Equals(kISupportsIID)) { + *aInstancePtr = (nsIContentSink*)(this); + } + else if(aIID.Equals(kIContentSinkIID)) { + *aInstancePtr = (nsIContentSink*)(this); + } + else if(aIID.Equals(kIHTMLContentSinkIID)) { + *aInstancePtr = (nsIHTMLContentSink*)(this); + } + else { + *aInstancePtr=0; + return NS_NOINTERFACE; + } + NS_ADDREF_THIS(); + return NS_OK; +} + + +NS_IMPL_ADDREF(nsHTMLToTXTSinkStream) +NS_IMPL_RELEASE(nsHTMLToTXTSinkStream) + + +/** + * This function is declared in nsHTMLToTXTSinkStream.h. It is used to + * cause the COM-like construction of an nsHTMLToTXTSinkStream.
+ * + * @update gpk02/03/99 + * @param aInstancePtrResult receives the new sink as an nsIHTMLContentSink + * @return NS_xxx error result + */ +NS_HTMLPARS nsresult +NS_New_HTMLToTXT_SinkStream(nsIHTMLContentSink** aInstancePtrResult) { + nsHTMLToTXTSinkStream* it = new nsHTMLToTXTSinkStream(); + if (nsnull == it) { + return NS_ERROR_OUT_OF_MEMORY; + } + + return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult); +} + +/** + * Construct a content sink stream that writes to cout. + * @update gpk02/03/99 + * @param + * @return + */ +nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() { + NS_INIT_REFCNT(); + mOutput=&cout; + mColPos = 0; + mIndent = 0; + mDoOutput = PR_FALSE; +} + +/** + * Construct a content sink stream that writes to aStream. + * @update gpk02/03/99 + * @param aStream output stream; the sink does not own it + * @return + */ +nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { + NS_INIT_REFCNT(); + mOutput = &aStream; + mColPos = 0; + mIndent = 0; + mDoOutput = PR_FALSE; +} + + +/** + * + * @update gpk02/03/99 + * @param + * @return + */ +nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() { + mOutput=0; //we don't own the stream we're given; just forget it.
+} + + +/** + * Redirect subsequent output to aStream. + * @update gpk02/03/99 + * @param + * @return + */ +NS_IMETHODIMP_(void) +nsHTMLToTXTSinkStream::SetOutputStream(ostream& aStream){ + mOutput=&aStream; +} + + + +/** + * Empty stub: emits nothing. + * @update gpk02/03/99 + * @param + * @return + */ +static +void OpenTagWithAttributes(const char* theTag,const nsIParserNode& aNode,int tab,ostream& aStream,PRBool aNewline) { +} + + +/** + * Empty stub: emits nothing. + * @update gpk02/03/99 + * @param + * @return + */ +static +void OpenTag(const char* theTag,int tab,ostream& aStream,PRBool aNewline) { +} + + +/** + * Empty stub: emits nothing. + * @update gpk02/03/99 + * @param + * @return + */ +static +void CloseTag(const char* theTag,int tab,ostream& aStream) { +} + + +/** + * Writes theContent to aStream via gBuffer; the OpenTag/CloseTag calls are empty stubs. + * @update gpk02/03/99 + * @param + * @return + */ +static +void WritePair(eHTMLTags aTag,const nsString& theContent,int tab,ostream& aStream) { + const char* titleStr = GetTagName(aTag); + OpenTag(titleStr,tab,aStream,PR_FALSE); + theContent.ToCString(gBuffer,sizeof(gBuffer)-1); + aStream << gBuffer; + CloseTag(titleStr,0,aStream); +} + +/** + * This method gets called by the parser when it encounters + * a title tag and wants to set the document title in the sink. + * + * @update gpk02/03/99 + * @param nsString reference to new title value + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::SetTitle(const nsString& aValue){ + return NS_OK; +} + + +/** + * This method is used to open the outer HTML container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenHTML(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to close the outer HTML container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK.
+ */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseHTML(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to open the only HEAD container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenHead(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to close the only HEAD container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseHead(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to open the main BODY container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenBody(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to close the main BODY container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseBody(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to open a new FORM container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenForm(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to close the outer FORM container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseForm(const nsIParserNode& aNode){ + return NS_OK; +} + +/** + * This method is used to open a new MAP container.
+ * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenMap(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to close the MAP container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseMap(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to open the FRAMESET container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenFrameset(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to close the FRAMESET container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseFrameset(const nsIParserNode& aNode){ + return NS_OK; +} + + + + + + + +/** + * This gets called by the parser when you want to add + * a leaf node to the current container in the content + * model.
+ * + * @updated gpk 06/18/98 + * @param aNode,aStream leaf node and the stream its text is written to + * @return NS_OK; only text and whitespace nodes are written, and only while mDoOutput is set + */ +nsresult +nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream) +{ + eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); + + if (mDoOutput == PR_FALSE) + return NS_OK; + + if (type == eHTMLTag_text) { + const nsString& text = aNode.GetText(); + + text.ToCString(gBuffer,sizeof(gBuffer)-1); + aStream << gBuffer; + mColPos += text.Length(); + } + else if (type == eHTMLTag_whitespace) + { + if (PR_TRUE) + { + const nsString& text = aNode.GetText(); + text.ToCString(gBuffer,sizeof(gBuffer)-1); + aStream << gBuffer; + mColPos += text.Length(); + } + } + + return NS_OK; +} + + + +/** + * This gets called by the parser when you want to add + * a PI node to the current container in the content + * model. + * + * @updated gpk02/03/99 + * @param aNode ignored by this sink + * @return NS_OK + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::AddProcessingInstruction(const nsIParserNode& aNode){ + return NS_OK; +} + +/** + * This gets called by the parser when you want to add + * a comment node to the current container in the content + * model. + * + * @updated gpk02/03/99 + * @param aNode ignored by this sink + * @return NS_OK + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::AddComment(const nsIParserNode& aNode){ + return NS_OK; +} + + +/** + * This method is used to open a general container. + * This includes: OL,UL,DIR,SPAN,TABLE,H[1..6],etc. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){ + eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); + const nsString& name = aNode.GetText(); + + if (type == eHTMLTag_body) + mDoOutput = PR_TRUE; + return NS_OK; +} + + +/** + * This method is used to close a generic container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK.
+ */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::CloseContainer(const nsIParserNode& aNode){ + eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); + const nsString& name = aNode.GetText(); + + if (type == eHTMLTag_body) + mDoOutput = PR_FALSE; + + if (IsBlockLevel(type)) + { + if (mColPos != 0) + { + *mOutput << endl; + mColPos = 0; + } + } + return NS_OK; +} + + +/** + * This method is used to add a leaf to the currently + * open container. + * + * @update 07/12/98 gpk + * @param nsIParserNode reference to parser node interface + * @return NS_OK. + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode){ + nsresult result = NS_OK; + if(mOutput) { + result = AddLeaf(aNode,*mOutput); + } + return result; +} + + +/** + * This method gets called when the parser begins the process + * of building the content model via the content sink. + * + * @update gpk02/03/99 + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::WillBuildModel(void){ + return NS_OK; +} + + +/** + * This method gets called when the parser concludes the process + * of building the content model via the content sink. + * + * @param aQualityLevel describes how well formed the doc was. + * 0=GOOD; 1=FAIR; 2=POOR; + * @update gpk02/03/99 + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::DidBuildModel(PRInt32 aQualityLevel) { + return NS_OK; +} + + +/** + * This method gets called when the parser gets i/o blocked, + * and wants to notify the sink that it may be a while before + * more data is available. + * + * @update gpk02/03/99 + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::WillInterrupt(void) { + return NS_OK; +} + + +/** + * This method gets called when the parser i/o gets unblocked, + * and we're about to start dumping content again to the sink.
+ * + * @update gpk02/03/99 + */ +NS_IMETHODIMP +nsHTMLToTXTSinkStream::WillResume(void) { + return NS_OK; +} + +NS_IMETHODIMP +nsHTMLToTXTSinkStream::SetParser(nsIParser* aParser) { + return NS_OK; +} + +NS_IMETHODIMP +nsHTMLToTXTSinkStream::NotifyError(nsresult aErrorResult) +{ + return NS_OK; +} + + +PRBool IsInline(eHTMLTags aTag) +{ + PRBool result = PR_FALSE; + + switch (aTag) + { + case eHTMLTag_a: + case eHTMLTag_address: + case eHTMLTag_big: + case eHTMLTag_blink: + case eHTMLTag_b: + case eHTMLTag_br: + case eHTMLTag_cite: + case eHTMLTag_code: + case eHTMLTag_dfn: + case eHTMLTag_em: + case eHTMLTag_font: + case eHTMLTag_img: + case eHTMLTag_i: + case eHTMLTag_kbd: + case eHTMLTag_keygen: + case eHTMLTag_nobr: + case eHTMLTag_samp: + case eHTMLTag_small: + case eHTMLTag_spacer: + case eHTMLTag_span: + case eHTMLTag_strike: + case eHTMLTag_strong: + case eHTMLTag_sub: + case eHTMLTag_sup: + case eHTMLTag_td: + case eHTMLTag_textarea: + case eHTMLTag_tt: + case eHTMLTag_var: + case eHTMLTag_wbr: + + result = PR_TRUE; + break; + + default: + break; + + } + return result; +} + +PRBool IsBlockLevel(eHTMLTags aTag) +{ + return !IsInline(aTag); +} + diff --git a/parser/htmlparser/src/nsHTMLToTXTSinkStream.h b/parser/htmlparser/src/nsHTMLToTXTSinkStream.h new file mode 100644 index 00000000000..743537c5df4 --- /dev/null +++ b/parser/htmlparser/src/nsHTMLToTXTSinkStream.h @@ -0,0 +1,132 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* + * The contents of this file are subject to the Netscape Public License + * Version 1.0 (the "NPL"); you may not use this file except in + * compliance with the NPL. You may obtain a copy of the NPL at + * http://www.mozilla.org/NPL/ + * + * Software distributed under the NPL is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL + * for the specific language governing rights and limitations under the + * NPL. 
+ * + * The Initial Developer of this code under the NPL is Netscape + * Communications Corporation. Portions created by Netscape are + * Copyright (C) 1998 Netscape Communications Corporation. All Rights + * Reserved. + */ + +/** + * MODULE NOTES: + * + * If you've been paying attention to our many content sink classes, you may be + * asking yourself, "why do we need yet another one?" The answer is that this + * implementation, unlike all the others, really sends its output to a given stream + * rather than to an actual content sink (as defined in our HTML document system). + * + * We use this class for a number of purposes: + * 1) For actual document i/o using XIF (xml interchange format) + * 2) For document conversions + * 3) For debug purposes (to cause output to go to cout or a file) + * + * If no stream is declared in the constructor then all output goes to cout. + * The file is pretty printed according to the pretty printing interface. Subclasses + * may choose to override this behavior or set runtime flags for desired results. + */ + +#ifndef NS_HTMLTOTEXTSINK_STREAM +#define NS_HTMLTOTEXTSINK_STREAM + +#include "nsIParserNode.h" +#include "nsIHTMLContentSink.h" +#include "nshtmlpars.h" +#include "nsHTMLTokens.h" + + +#define NS_HTMLTOTEXTSINK_STREAM_IID \ + {0xa39c6bff, 0x15f0, 0x11d2, \ + {0x80, 0x41, 0x0, 0x10, 0x4b, 0x98, 0x3f, 0xd4}} + +#ifndef XP_MAC +class ostream; +#endif + +class nsHTMLToTXTSinkStream : public nsIHTMLContentSink { + public: + + /** + * Standard constructor + * @update gpk02/03/99 + */ + nsHTMLToTXTSinkStream(); + nsHTMLToTXTSinkStream(ostream& aStream); + + /** + * virtual destructor + * @update gpk02/03/99 + */ + virtual ~nsHTMLToTXTSinkStream(); + + NS_IMETHOD_(void) SetOutputStream(ostream& aStream); + + // nsISupports + NS_DECL_ISUPPORTS + + /******************************************************************* + * The following methods are inherited from nsIContentSink. + * Please see that file for details.
+ *******************************************************************/ + NS_IMETHOD WillBuildModel(void); + NS_IMETHOD DidBuildModel(PRInt32 aQualityLevel); + NS_IMETHOD WillInterrupt(void); + NS_IMETHOD WillResume(void); + NS_IMETHOD SetParser(nsIParser* aParser); + NS_IMETHOD OpenContainer(const nsIParserNode& aNode); + NS_IMETHOD CloseContainer(const nsIParserNode& aNode); + NS_IMETHOD AddLeaf(const nsIParserNode& aNode); + NS_IMETHOD NotifyError(nsresult aErrorResult); + NS_IMETHOD AddComment(const nsIParserNode& aNode); + NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode); + + /******************************************************************* + * The following methods are inherited from nsIHTMLContentSink. + * Please see that file for details. + *******************************************************************/ + NS_IMETHOD SetTitle(const nsString& aValue); + NS_IMETHOD OpenHTML(const nsIParserNode& aNode); + NS_IMETHOD CloseHTML(const nsIParserNode& aNode); + NS_IMETHOD OpenHead(const nsIParserNode& aNode); + NS_IMETHOD CloseHead(const nsIParserNode& aNode); + NS_IMETHOD OpenBody(const nsIParserNode& aNode); + NS_IMETHOD CloseBody(const nsIParserNode& aNode); + NS_IMETHOD OpenForm(const nsIParserNode& aNode); + NS_IMETHOD CloseForm(const nsIParserNode& aNode); + NS_IMETHOD OpenMap(const nsIParserNode& aNode); + NS_IMETHOD CloseMap(const nsIParserNode& aNode); + NS_IMETHOD OpenFrameset(const nsIParserNode& aNode); + NS_IMETHOD CloseFrameset(const nsIParserNode& aNode); + + +protected: + + nsresult AddLeaf(const nsIParserNode& aNode, ostream& aStream); + void WriteAttributes(const nsIParserNode& aNode,ostream& aStream); + + +protected: + ostream* mOutput; + PRInt32 mIndent; + PRInt32 mColPos; + PRBool mDoOutput; + +}; + +extern NS_HTMLPARS nsresult +NS_New_HTMLToTXT_SinkStream(nsIHTMLContentSink** aInstancePtrResult); + + +#endif + + + +