gecko-dev/webshell/tests/viewer/nsWebCrawler.h

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Mozilla Communicator client code.
 *
 * The Initial Developer of the Original Code is Netscape Communications
 * Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */
#ifndef nsWebCrawler_h___
#define nsWebCrawler_h___

#include "nsCOMPtr.h"
#include "nsBrowserWindow.h"
#include "nsIWebProgressListener.h"
#include "nsVoidArray.h"
#include "nsString.h"
#include "nsIAtom.h"
#include "nsWeakReference.h"

class nsIContent;
class nsIDocument;
class nsITimer;
class nsIURI;
class nsIPresShell;
class nsViewerApp;
class AtomHashTable;

class nsWebCrawler : public nsIWebProgressListener,
                     public nsSupportsWeakReference {
public:
  // Make a new web-crawler for the given viewer. Note: the web
  // crawler does not addref the viewer.
  nsWebCrawler(nsViewerApp* aViewer);

  // nsISupports
  NS_DECL_ISUPPORTS

  // nsIWebProgressListener
  NS_DECL_NSIWEBPROGRESSLISTENER
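
  // A minimal usage sketch (illustrative only: "theViewerApp" stands in
  // for the driver's nsViewerApp instance, and the URL string setup is
  // elided):
  //
  //   nsWebCrawler* crawler = new nsWebCrawler(theViewerApp);
  //   NS_ADDREF(crawler);          // refcounted via nsISupports
  //   nsAutoString url;            // fill with e.g. "http://www.mozilla.org/"
  //   crawler->AddURL(url);
  //   crawler->SetExitOnDone(PR_TRUE);
  //   crawler->Start();
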
  // Add a URL to load
  void AddURL(const nsString& aURL);

  // Add a domain that is safe to load URLs from
  void AddSafeDomain(const nsString& aDomain);

  // Add a domain that must be avoided
  void AddAvoidDomain(const nsString& aDomain);

  void SetBrowserWindow(nsBrowserWindow* aWindow);
  void GetBrowserWindow(nsBrowserWindow** aWindow);

  // Set the delay between page loads (by default, one second)
  void SetDelay(PRInt32 aSeconds) {
    mDelay = aSeconds;
  }
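
  // For example, to keep a crawl inside a single site (sketch; the domain
  // strings are illustrative, not required values):
  //
  //   crawler->AddSafeDomain(safeDomain);   // e.g. "www.mozilla.org"
  //   crawler->AddAvoidDomain(avoidDomain); // e.g. "bugzilla.mozilla.org"
  //   crawler->SetDelay(2);                 // wait two seconds between loads
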
  void SetPrintTest(PRInt32 aTestType) { mPrinterTestType = aTestType; }

  void RegressionOutput(PRInt32 aRegressionOutputLevel) {
    mRegressionOutputLevel = aRegressionOutputLevel;
  }

  void EnableJiggleLayout() {
    mJiggleLayout = PR_TRUE;
  }

  // If set to TRUE, the crawler will post an exit message when it finishes
  void SetExitOnDone(PRBool aPostExit) {
    mPostExit = aPostExit;
  }

  // Start loading documents
  void Start();

  // Enable the crawler; when a document contains links to other
  // documents the crawler will go to them, subject to the limit on
  // the total crawl count and the domain name checks.
  void EnableCrawler();

  void SetRecordFile(FILE* aFile) {
    mRecord = aFile;
  }
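
  // A crawl-mode sketch (illustrative; the log file name is an assumption,
  // and SetMaxPages is declared just below):
  //
  //   crawler->EnableCrawler();
  //   crawler->SetMaxPages(100);                        // stop after 100 pages
  //   crawler->SetRecordFile(fopen("crawl.log", "w"));  // record visited URLs
  //   crawler->SetExitOnDone(PR_TRUE);
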
  void SetMaxPages(PRInt32 aMax) {
    mMaxPages = aMax;
  }

  void SetOutputDir(const nsString& aOutputDir);

  void DumpRegressionData(nsIWebShell* aWebShell, nsIURI* aURL);

  void SetRegressionDir(const nsString& aOutputDir);
  void SetEnableRegression(PRBool aSetting) {
    mRegressing = aSetting;
  }

  void LoadNextURL(PRBool aQueueLoad);
  nsresult QueueLoadURL(const nsString& aURL);
  void GoToQueuedURL(const nsString& aURL);
  void QueueExit();
  void Exit();
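
  // A plausible regression-test flow, assuming SetOutputDir names the
  // directory where dumps are written and SetRegressionDir names the
  // baseline to compare against (directory values are illustrative):
  //
  //   // baseline run: write reference dumps
  //   crawler->SetOutputDir(baselineDir);
  //
  //   // verify run: compare fresh output against the baseline
  //   crawler->SetRegressionDir(baselineDir);
  //   crawler->SetEnableRegression(PR_TRUE);
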
  void SetVerbose(PRBool aSetting) {
    mVerbose = aSetting;
  }

  PRBool Crawling() const {
    return mCrawl;
  }

  PRBool LoadingURLList() const {
    return mHaveURLList;
  }

  void IncludeStyleData(PRBool aIncludeStyle) {
    mIncludeStyleInfo = aIncludeStyle;
  }

protected:
  virtual ~nsWebCrawler();

  void FindURLsIn(nsIDocument* aDocument, nsIContent* aNode);
  void FindMoreURLs();

  PRBool OkToLoad(const nsString& aURLSpec);
  void RecordLoadedURL(const nsString& aURLSpec);

  /** Generate an output file name from a URL */
  FILE* GetOutputFile(nsIURI* aURL, nsString& aOutputName);

  nsIPresShell* GetPresShell(nsIWebShell* aWebShell = nsnull);
  void PerformRegressionTest(const nsString& aOutputName);

  nsBrowserWindow* mBrowser;
  nsViewerApp* mViewer;
  nsCOMPtr<nsITimer> mTimer;

  FILE* mRecord;
  nsCOMPtr<nsIAtom> mLinkTag;
  nsCOMPtr<nsIAtom> mFrameTag;
  nsCOMPtr<nsIAtom> mIFrameTag;
  nsCOMPtr<nsIAtom> mHrefAttr;
  nsCOMPtr<nsIAtom> mSrcAttr;
  nsCOMPtr<nsIAtom> mBaseHrefAttr;

  AtomHashTable* mVisited;
  nsString mOutputDir;

  PRBool mCrawl;
  PRBool mHaveURLList;
  PRBool mJiggleLayout;
  PRBool mPostExit;

  PRInt32 mDelay;     // first delay encountered from command line or delay:= in file
  PRInt32 mLastDelay; // last delay encountered from command line or delay:= in file
  PRInt32 mMaxPages;

  nsString mCurrentURL;
  nsCOMPtr<nsIURI> mLastURL;
  nsIWebShell* mLastWebShell;
  PRTime mStartLoad;

  PRBool mVerbose;
  PRBool mRegressing;
  PRInt32 mPrinterTestType;
  PRInt32 mRegressionOutputLevel;
  nsString mRegressionDir;
  PRBool mIncludeStyleInfo;

  nsVoidArray mPendingURLs;
  nsVoidArray mSafeDomains;
  nsVoidArray mAvoidDomains;
  PRInt32 mQueuedLoadURLs;
};

#endif /* nsWebCrawler_h___ */