2001-09-29 00:14:13 +04:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
|
|
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
1998-07-28 02:16:13 +04:00
|
|
|
*
|
2001-09-29 00:14:13 +04:00
|
|
|
* The contents of this file are subject to the Netscape Public License
|
|
|
|
* Version 1.1 (the "License"); you may not use this file except in
|
|
|
|
* compliance with the License. You may obtain a copy of the License at
|
|
|
|
* http://www.mozilla.org/NPL/
|
1998-07-28 02:16:13 +04:00
|
|
|
*
|
2001-09-29 00:14:13 +04:00
|
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
|
|
* for the specific language governing rights and limitations under the
|
|
|
|
* License.
|
1998-07-28 02:16:13 +04:00
|
|
|
*
|
|
|
|
* The Original Code is Mozilla Communicator client code.
|
|
|
|
*
|
2001-09-29 00:14:13 +04:00
|
|
|
* The Initial Developer of the Original Code is
|
|
|
|
* Netscape Communications Corporation.
|
|
|
|
* Portions created by the Initial Developer are Copyright (C) 1998
|
|
|
|
* the Initial Developer. All Rights Reserved.
|
1999-11-06 06:43:54 +03:00
|
|
|
*
|
2001-09-29 00:14:13 +04:00
|
|
|
* Contributor(s):
|
|
|
|
*
|
|
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
|
|
* use your version of this file under the terms of the NPL, indicate your
|
|
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
|
|
* the provisions above, a recipient may use your version of this file under
|
|
|
|
* the terms of any one of the NPL, the GPL or the LGPL.
|
|
|
|
*
|
|
|
|
* ***** END LICENSE BLOCK ***** */
|
1998-07-28 02:16:13 +04:00
|
|
|
#ifndef nsWebCrawler_h___
|
|
|
|
#define nsWebCrawler_h___
|
|
|
|
|
1999-10-05 03:41:25 +04:00
|
|
|
#include "nsCOMPtr.h"
|
2000-03-14 14:08:43 +03:00
|
|
|
#include "nsBrowserWindow.h"
|
2001-05-03 03:58:42 +04:00
|
|
|
#include "nsIWebProgressListener.h"
|
1998-07-28 02:16:13 +04:00
|
|
|
#include "nsVoidArray.h"
|
|
|
|
#include "nsString.h"
|
1999-10-05 03:41:25 +04:00
|
|
|
#include "nsIAtom.h"
|
2001-05-03 03:58:42 +04:00
|
|
|
#include "nsWeakReference.h"
|
2001-07-16 06:40:48 +04:00
|
|
|
#include "nsIURI.h"
|
2001-05-03 03:58:42 +04:00
|
|
|
|
1998-07-28 02:16:13 +04:00
|
|
|
|
|
|
|
// Forward declarations for types that this header only uses through
// pointers, so their full definitions are not required here.
class nsIContent;
class nsIDocument;
class nsITimer;
class nsIURI;
class nsIPresShell;
// nsIWebShell is named by nsWebCrawler::GetPresShell(); declare it here so
// this header does not depend on another include happening to provide it.
class nsIWebShell;
class nsViewerApp;
class AtomHashTable;
|
|
|
|
|
2001-05-03 03:58:42 +04:00
|
|
|
class nsWebCrawler : public nsIWebProgressListener,
|
|
|
|
public nsSupportsWeakReference {
|
1998-07-28 02:16:13 +04:00
|
|
|
public:
|
1999-02-08 20:57:00 +03:00
|
|
|
// Make a new web-crawler for the given viewer. Note: the web
|
|
|
|
// crawler does not addref the viewer.
|
1998-07-28 02:16:13 +04:00
|
|
|
nsWebCrawler(nsViewerApp* aViewer);
|
|
|
|
|
|
|
|
// nsISupports
|
|
|
|
NS_DECL_ISUPPORTS
|
|
|
|
|
2001-05-05 00:15:38 +04:00
|
|
|
// nsIWebProgressListener
|
2001-05-03 03:58:42 +04:00
|
|
|
NS_DECL_NSIWEBPROGRESSLISTENER
|
1998-07-28 02:16:13 +04:00
|
|
|
|
|
|
|
// Add a url to load
|
|
|
|
void AddURL(const nsString& aURL);
|
|
|
|
|
|
|
|
// Add a domain that is safe to load url's from
|
|
|
|
void AddSafeDomain(const nsString& aDomain);
|
|
|
|
|
|
|
|
// Add a domain that must be avoided
|
|
|
|
void AddAvoidDomain(const nsString& aDomain);
|
|
|
|
|
2000-03-14 14:08:43 +03:00
|
|
|
void SetBrowserWindow(nsBrowserWindow* aWindow);
|
|
|
|
void GetBrowserWindow(nsBrowserWindow** aWindow);
|
1998-07-28 02:16:13 +04:00
|
|
|
|
2000-01-26 18:14:41 +03:00
|
|
|
void SetPrintTest(PRInt32 aTestType) { mPrinterTestType = aTestType; }
|
|
|
|
|
2001-05-04 02:12:35 +04:00
|
|
|
void RegressionOutput(PRInt32 aRegressionOutputLevel) { mRegressionOutputLevel = aRegressionOutputLevel; }
|
|
|
|
|
1998-07-28 02:16:13 +04:00
|
|
|
void EnableJiggleLayout() {
|
|
|
|
mJiggleLayout = PR_TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If set to TRUE the loader will post an exit message on exit
|
|
|
|
void SetExitOnDone(PRBool aPostExit) {
|
|
|
|
mPostExit = aPostExit;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start loading documents
|
|
|
|
void Start();
|
|
|
|
|
|
|
|
// Enable the crawler; when a document contains links to other
|
|
|
|
// documents the crawler will go to them subject to the limitations
|
|
|
|
// on the total crawl count and the domain name checks.
|
|
|
|
void EnableCrawler();
|
|
|
|
|
|
|
|
void SetRecordFile(FILE* aFile) {
|
|
|
|
mRecord = aFile;
|
|
|
|
}
|
|
|
|
|
1998-07-28 03:01:06 +04:00
|
|
|
void SetMaxPages(PRInt32 aMax) {
|
|
|
|
mMaxPages = aMax;
|
|
|
|
}
|
|
|
|
|
1998-09-03 02:07:42 +04:00
|
|
|
void SetOutputDir(const nsString& aOutputDir);
|
|
|
|
|
2001-05-11 06:17:35 +04:00
|
|
|
void DumpRegressionData();
|
1998-12-10 21:05:59 +03:00
|
|
|
void SetRegressionDir(const nsString& aOutputDir);
|
|
|
|
|
|
|
|
void SetEnableRegression(PRBool aSetting) {
|
|
|
|
mRegressing = aSetting;
|
|
|
|
}
|
|
|
|
|
2001-05-11 06:17:35 +04:00
|
|
|
static void
|
|
|
|
LoadNextURLCallback(nsITimer* aTimer, void* aClosure);
|
|
|
|
|
1999-10-15 18:09:19 +04:00
|
|
|
void LoadNextURL(PRBool aQueueLoad);
|
|
|
|
|
|
|
|
nsresult QueueLoadURL(const nsString& aURL);
|
|
|
|
|
|
|
|
void GoToQueuedURL(const nsString& aURL);
|
|
|
|
|
2001-05-11 06:17:35 +04:00
|
|
|
static void
|
|
|
|
QueueExitCallback(nsITimer* atimer, void* aClosure);
|
|
|
|
|
1999-10-15 18:09:19 +04:00
|
|
|
void QueueExit();
|
|
|
|
|
|
|
|
void Exit();
|
1998-07-28 02:16:13 +04:00
|
|
|
|
1998-11-19 20:24:13 +03:00
|
|
|
void SetVerbose(PRBool aSetting) {
|
|
|
|
mVerbose = aSetting;
|
|
|
|
}
|
|
|
|
|
1999-10-29 06:06:09 +04:00
|
|
|
PRBool Crawling() const {
|
|
|
|
return mCrawl;
|
|
|
|
}
|
|
|
|
|
|
|
|
PRBool LoadingURLList() const {
|
|
|
|
return mHaveURLList;
|
|
|
|
}
|
|
|
|
|
2000-12-07 18:31:40 +03:00
|
|
|
void IncludeStyleData(PRBool aIncludeStyle) {
|
|
|
|
mIncludeStyleInfo = aIncludeStyle;
|
|
|
|
}
|
|
|
|
|
1998-07-28 02:16:13 +04:00
|
|
|
protected:
|
|
|
|
virtual ~nsWebCrawler();
|
|
|
|
|
|
|
|
void FindURLsIn(nsIDocument* aDocument, nsIContent* aNode);
|
|
|
|
|
|
|
|
void FindMoreURLs();
|
|
|
|
|
|
|
|
PRBool OkToLoad(const nsString& aURLSpec);
|
|
|
|
|
|
|
|
void RecordLoadedURL(const nsString& aURLSpec);
|
|
|
|
|
1998-09-03 02:07:42 +04:00
|
|
|
/** generate an output name from a URL */
|
1999-06-23 07:29:44 +04:00
|
|
|
FILE* GetOutputFile(nsIURI *aURL, nsString& aOutputName);
|
1998-09-03 02:07:42 +04:00
|
|
|
|
2000-12-15 01:27:22 +03:00
|
|
|
nsIPresShell* GetPresShell(nsIWebShell* aWebShell = nsnull);
|
1998-09-03 02:07:42 +04:00
|
|
|
|
1998-12-10 21:05:59 +03:00
|
|
|
void PerformRegressionTest(const nsString& aOutputName);
|
|
|
|
|
2000-03-14 14:08:43 +03:00
|
|
|
nsBrowserWindow* mBrowser;
|
1998-07-28 02:16:13 +04:00
|
|
|
nsViewerApp* mViewer;
|
2000-05-17 06:49:35 +04:00
|
|
|
nsCOMPtr<nsITimer> mTimer;
|
1998-07-28 02:16:13 +04:00
|
|
|
FILE* mRecord;
|
1999-10-05 03:41:25 +04:00
|
|
|
nsCOMPtr<nsIAtom> mLinkTag;
|
|
|
|
nsCOMPtr<nsIAtom> mFrameTag;
|
|
|
|
nsCOMPtr<nsIAtom> mIFrameTag;
|
|
|
|
nsCOMPtr<nsIAtom> mHrefAttr;
|
|
|
|
nsCOMPtr<nsIAtom> mSrcAttr;
|
|
|
|
nsCOMPtr<nsIAtom> mBaseHrefAttr;
|
1998-07-28 02:16:13 +04:00
|
|
|
AtomHashTable* mVisited;
|
1998-12-10 21:05:59 +03:00
|
|
|
nsString mOutputDir;
|
1998-07-28 02:16:13 +04:00
|
|
|
|
|
|
|
PRBool mCrawl;
|
1999-10-29 06:06:09 +04:00
|
|
|
PRBool mHaveURLList;
|
1998-07-28 02:16:13 +04:00
|
|
|
PRBool mJiggleLayout;
|
|
|
|
PRBool mPostExit;
|
2001-05-11 06:17:35 +04:00
|
|
|
PRInt32 mDelay;
|
1998-07-28 03:01:06 +04:00
|
|
|
PRInt32 mMaxPages;
|
1998-07-28 02:16:13 +04:00
|
|
|
|
1998-11-19 20:24:13 +03:00
|
|
|
nsString mCurrentURL;
|
2001-05-03 03:58:42 +04:00
|
|
|
nsCOMPtr<nsIURI> mLastURL;
|
2000-12-15 01:27:22 +03:00
|
|
|
|
1999-10-29 06:06:09 +04:00
|
|
|
PRTime mStartLoad;
|
1998-11-19 20:24:13 +03:00
|
|
|
PRBool mVerbose;
|
1998-12-10 21:05:59 +03:00
|
|
|
PRBool mRegressing;
|
2000-01-26 18:14:41 +03:00
|
|
|
PRInt32 mPrinterTestType;
|
2001-05-04 02:12:35 +04:00
|
|
|
PRInt32 mRegressionOutputLevel;
|
1998-12-10 21:05:59 +03:00
|
|
|
nsString mRegressionDir;
|
2000-12-07 18:31:40 +03:00
|
|
|
PRBool mIncludeStyleInfo;
|
1998-11-19 20:24:13 +03:00
|
|
|
|
1998-07-28 02:16:13 +04:00
|
|
|
nsVoidArray mPendingURLs;
|
|
|
|
nsVoidArray mSafeDomains;
|
|
|
|
nsVoidArray mAvoidDomains;
|
1999-10-15 18:09:19 +04:00
|
|
|
|
|
|
|
PRInt32 mQueuedLoadURLs;
|
1998-07-28 02:16:13 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* nsWebCrawler_h___ */
|