1998-05-14 05:08:06 +04:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
|
|
/*
|
1999-11-06 06:43:54 +03:00
|
|
|
* The contents of this file are subject to the Netscape Public
|
|
|
|
* License Version 1.1 (the "License"); you may not use this file
|
|
|
|
* except in compliance with the License. You may obtain a copy of
|
|
|
|
* the License at http://www.mozilla.org/NPL/
|
1998-05-14 05:08:06 +04:00
|
|
|
*
|
1999-11-06 06:43:54 +03:00
|
|
|
* Software distributed under the License is distributed on an "AS
|
|
|
|
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
|
|
|
* implied. See the License for the specific language governing
|
|
|
|
* rights and limitations under the License.
|
1998-05-14 05:08:06 +04:00
|
|
|
*
|
1999-11-06 06:43:54 +03:00
|
|
|
* The Original Code is mozilla.org code.
|
|
|
|
*
|
|
|
|
* The Initial Developer of the Original Code is Netscape
|
1998-05-14 05:08:06 +04:00
|
|
|
* Communications Corporation. Portions created by Netscape are
|
1999-11-06 06:43:54 +03:00
|
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All
|
|
|
|
* Rights Reserved.
|
|
|
|
*
|
|
|
|
* Contributor(s):
|
2000-01-11 23:49:15 +03:00
|
|
|
* Pierre Phaneuf <pp@ludusdesign.com>
|
1998-05-14 05:08:06 +04:00
|
|
|
*/
|
|
|
|
#include "nsIRobotSink.h"
|
|
|
|
#include "nsIRobotSinkObserver.h"
|
|
|
|
#include "nsIParser.h"
|
2000-03-29 10:13:07 +04:00
|
|
|
#include "nsIDocShell.h"
|
2001-05-05 00:15:38 +04:00
|
|
|
#include "nsIWebNavigation.h"
|
|
|
|
#include "nsIWebProgress.h"
|
|
|
|
#include "nsIWebProgressListener.h"
|
|
|
|
#include "nsWeakReference.h"
|
1998-05-14 05:08:06 +04:00
|
|
|
#include "nsVoidArray.h"
|
|
|
|
#include "nsString.h"
|
|
|
|
#include "nsIURL.h"
|
1999-06-18 21:34:08 +04:00
|
|
|
#include "nsIServiceManager.h"
|
1999-06-23 07:29:44 +04:00
|
|
|
#include "nsIURL.h"
|
1999-06-18 21:34:08 +04:00
|
|
|
#include "nsIIOService.h"
|
2001-04-10 10:01:08 +04:00
|
|
|
#include "nsNetCID.h"
|
1999-03-09 12:44:27 +03:00
|
|
|
#include "nsIComponentManager.h"
|
1998-07-31 02:42:27 +04:00
|
|
|
#include "nsParserCIID.h"
|
2001-05-05 00:15:38 +04:00
|
|
|
#include "nsIInterfaceRequestor.h"
|
1998-05-14 05:08:06 +04:00
|
|
|
|
2001-04-10 10:01:08 +04:00
|
|
|
static NS_DEFINE_CID(kIOServiceCID, NS_IOSERVICE_CID);
|
1998-05-14 05:08:06 +04:00
|
|
|
static NS_DEFINE_IID(kIRobotSinkObserverIID, NS_IROBOTSINKOBSERVER_IID);
|
|
|
|
|
|
|
|
class RobotSinkObserver : public nsIRobotSinkObserver {
|
|
|
|
public:
|
|
|
|
RobotSinkObserver() {
|
|
|
|
NS_INIT_REFCNT();
|
|
|
|
}
|
|
|
|
|
1999-05-13 06:25:13 +04:00
|
|
|
virtual ~RobotSinkObserver() {
|
1998-05-14 05:08:06 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
NS_DECL_ISUPPORTS
|
|
|
|
|
|
|
|
NS_IMETHOD ProcessLink(const nsString& aURLSpec);
|
1998-05-19 23:06:59 +04:00
|
|
|
NS_IMETHOD VerifyDirectory (const char * verify_dir);
|
1998-05-15 01:47:33 +04:00
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
};
|
|
|
|
|
1998-05-15 01:47:33 +04:00
|
|
|
static nsVoidArray * g_workList;
|
|
|
|
static nsVoidArray * g_duplicateList;
|
|
|
|
static int g_iProcessed;
|
|
|
|
static int g_iMaxProcess = 5000;
|
|
|
|
static PRBool g_bHitTop;
|
1998-05-28 04:21:34 +04:00
|
|
|
static PRBool g_bReadyForNextUrl;
|
1998-05-15 01:47:33 +04:00
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
// nsISupports implementation for RobotSinkObserver, keyed on kIRobotSinkObserverIID.
NS_IMPL_ISUPPORTS(RobotSinkObserver, kIRobotSinkObserverIID);
|
|
|
|
|
1998-05-19 23:06:59 +04:00
|
|
|
/**
 * No-op: the directory argument is ignored and NS_OK is always returned.
 */
NS_IMETHODIMP
RobotSinkObserver::VerifyDirectory(const char * verify_dir)
{
  return NS_OK;
}
|
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
/**
 * Decide whether a URL spec reported to the observer should be crawled.
 *
 * A link is queued on g_workList only when all of the following hold:
 *   - the load limit has not been reached (g_bHitTop is false),
 *   - the spec ends in "/", "html" or "htm",
 *   - the spec has not been seen before (it is not in g_duplicateList),
 *   - the spec starts with "http:".
 * Every spec that passes the suffix filter is recorded in g_duplicateList,
 * including non-http ones, so each is reported at most once.
 *
 * @param aURLSpec  URL spec to consider.
 * @return always NS_OK; rejected links are reported on stdout or ignored.
 */
NS_IMETHODIMP RobotSinkObserver::ProcessLink(const nsString& aURLSpec)
{
  // Nothing to do once the limit is hit, or when no crawl has set up the
  // lists (they are only created by DebugRobot).
  if (g_bHitTop || nsnull == g_workList || nsnull == g_duplicateList) {
    return NS_OK;
  }

  // One mutable copy is enough for all the prefix/suffix probes below.
  nsString spec(aURLSpec);
  nsAutoString str;

  // Geez this is ugly. temporary hack to only process html files
  str.Truncate();
  spec.Right(str, 1);
  if (!str.EqualsWithConversion("/")) {
    str.Truncate();
    spec.Right(str, 4);
    if (!str.EqualsWithConversion("html")) {
      str.Truncate();
      spec.Right(str, 3);
      if (!str.EqualsWithConversion("htm")) {
        return NS_OK;
      }
    }
  }

  // Skip anything we have already seen.
  PRInt32 nCount = g_duplicateList->Count();
  for (PRInt32 n = 0; n < nCount; n++) {
    nsString * pstr = (nsString *)g_duplicateList->ElementAt(n);
    if (pstr && pstr->Equals(aURLSpec)) {
      fputs ("Robot: (duplicate '",stdout);
      fputs (aURLSpec,stdout);
      fputs ("')\n",stdout);
      return NS_OK;
    }
  }
  g_duplicateList->AppendElement(new nsString(aURLSpec));

  str.Truncate();
  spec.Left(str, 5);
  if (str.EqualsWithConversion("http:")) {
    g_iProcessed++;
    // A non-positive limit means "no limit". Use >= rather than == so that a
    // limit of 1 (threshold 0) still trips: the counter is already >= 1 here
    // and an equality test against 0 could never succeed.
    if (g_iMaxProcess > 0 && g_iProcessed >= g_iMaxProcess - 1) {
      g_bHitTop = PR_TRUE;
    }
    g_workList->AppendElement(new nsString(aURLSpec));
  }
  else {
    fputs ("Robot: (cannot process URL types '",stdout);
    fputs (aURLSpec,stdout);
    fputs ("')\n",stdout);
  }

  return NS_OK;
}
|
|
|
|
|
1998-05-19 23:06:59 +04:00
|
|
|
// Exported C-linkage declaration; not defined or called in the visible part of this file.
extern "C" NS_EXPORT void SetVerificationDirectory(char * verify_dir);
|
1998-05-14 05:08:06 +04:00
|
|
|
|
2001-05-05 00:15:38 +04:00
|
|
|
class CStreamListener: public nsIWebProgressListener,
|
|
|
|
public nsSupportsWeakReference
|
1998-06-04 05:19:25 +04:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
CStreamListener() {
|
|
|
|
NS_INIT_REFCNT();
|
1998-07-01 15:11:44 +04:00
|
|
|
|
1998-06-04 05:19:25 +04:00
|
|
|
}
|
|
|
|
|
1999-05-13 06:25:13 +04:00
|
|
|
virtual ~CStreamListener() {
|
1998-06-04 05:19:25 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
NS_DECL_ISUPPORTS
|
|
|
|
|
2001-05-05 00:15:38 +04:00
|
|
|
// nsIWebProgressListener
|
|
|
|
NS_DECL_NSIWEBPROGRESSLISTENER
|
1998-06-04 05:19:25 +04:00
|
|
|
};
|
|
|
|
|
2001-05-05 00:15:38 +04:00
|
|
|
// nsIWebProgressListener implementation
|
1999-10-05 08:04:16 +04:00
|
|
|
/**
 * When a state change carries both STATE_IS_DOCUMENT and STATE_STOP, print
 * "done." and flag that the robot may move on to the next URL.
 * All other state changes are ignored. Always returns NS_OK.
 */
NS_IMETHODIMP
CStreamListener::OnStateChange(nsIWebProgress* aWebProgress,
                               nsIRequest *aRequest,
                               PRInt32 progressStateFlags,
                               nsresult aStatus)
{
  const PRBool isDocument =
    (progressStateFlags & nsIWebProgressListener::STATE_IS_DOCUMENT) != 0;
  const PRBool isStop =
    (progressStateFlags & nsIWebProgressListener::STATE_STOP) != 0;

  if (isDocument && isStop) {
    fputs("done.\n", stdout);
    g_bReadyForNextUrl = PR_TRUE;
  }
  return NS_OK;
}
|
|
|
|
|
1999-10-05 08:04:16 +04:00
|
|
|
/**
 * Progress notifications are not used by the robot; all arguments are
 * ignored and NS_ERROR_NOT_IMPLEMENTED is returned.
 */
NS_IMETHODIMP
CStreamListener::OnProgressChange(nsIWebProgress *aWebProgress,
                                  nsIRequest *aRequest,
                                  PRInt32 aCurSelfProgress,
                                  PRInt32 aMaxSelfProgress,
                                  PRInt32 aCurTotalProgress,
                                  PRInt32 aMaxTotalProgress)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}
|
|
|
|
|
|
|
|
/**
 * Location changes are not used by the robot; all arguments are ignored
 * and NS_ERROR_NOT_IMPLEMENTED is returned.
 */
NS_IMETHODIMP
CStreamListener::OnLocationChange(nsIWebProgress* aWebProgress,
                                  nsIRequest* aRequest,
                                  nsIURI *location)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}
|
|
|
|
|
1998-06-04 05:19:25 +04:00
|
|
|
|
1999-10-05 08:04:16 +04:00
|
|
|
/**
 * Status messages are not used by the robot; all arguments are ignored
 * and NS_ERROR_NOT_IMPLEMENTED is returned.
 */
NS_IMETHODIMP
CStreamListener::OnStatusChange(nsIWebProgress* aWebProgress,
                                nsIRequest* aRequest,
                                nsresult aStatus,
                                const PRUnichar* aMessage)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}
|
|
|
|
|
|
|
|
|
2001-05-05 00:15:38 +04:00
|
|
|
/**
 * Security-state changes are not used by the robot; all arguments are
 * ignored and NS_ERROR_NOT_IMPLEMENTED is returned.
 */
NS_IMETHODIMP
CStreamListener::OnSecurityChange(nsIWebProgress *aWebProgress,
                                  nsIRequest *aRequest,
                                  PRInt32 state)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}
|
|
|
|
|
2001-05-05 00:15:38 +04:00
|
|
|
// nsISupports implementation for CStreamListener: it answers to the
// progress-listener and weak-reference interfaces.
NS_IMPL_ISUPPORTS2(CStreamListener, nsIWebProgressListener, nsISupportsWeakReference)
|
1998-06-04 05:19:25 +04:00
|
|
|
|
1998-06-12 05:35:36 +04:00
|
|
|
// Exported C-linkage declaration; not defined or called in the visible part of this file.
extern "C" NS_EXPORT void DumpVectorRecord(void);
|
1998-05-19 23:06:59 +04:00
|
|
|
//----------------------------------------------------------------------
|
|
|
|
extern "C" NS_EXPORT int DebugRobot(
|
|
|
|
nsVoidArray * workList,
|
2000-03-29 10:13:07 +04:00
|
|
|
nsIDocShell * docShell,
|
1998-05-19 23:06:59 +04:00
|
|
|
int iMaxLoads,
|
|
|
|
char * verify_dir,
|
|
|
|
void (*yieldProc )(const char *)
|
|
|
|
)
|
1998-05-14 05:08:06 +04:00
|
|
|
{
|
1998-06-12 05:35:36 +04:00
|
|
|
int iCount = 1;
|
1998-06-04 05:19:25 +04:00
|
|
|
CStreamListener * pl = new CStreamListener;
|
|
|
|
NS_ADDREF(pl);
|
1998-07-01 15:11:44 +04:00
|
|
|
|
1998-05-15 01:47:33 +04:00
|
|
|
if (nsnull==workList)
|
|
|
|
return -1;
|
1998-05-19 23:06:59 +04:00
|
|
|
g_iMaxProcess = iMaxLoads;
|
1998-05-15 01:47:33 +04:00
|
|
|
g_iProcessed = 0;
|
|
|
|
g_bHitTop = PR_FALSE;
|
|
|
|
g_duplicateList = new nsVoidArray();
|
1998-05-14 05:08:06 +04:00
|
|
|
RobotSinkObserver* myObserver = new RobotSinkObserver();
|
|
|
|
NS_ADDREF(myObserver);
|
1998-05-15 01:47:33 +04:00
|
|
|
g_workList = workList;
|
1998-05-14 05:08:06 +04:00
|
|
|
|
1998-07-30 03:43:20 +04:00
|
|
|
/*
|
1998-06-25 05:42:50 +04:00
|
|
|
nsIDTDDebug * pIDTDDebug;
|
|
|
|
nsresult rval = NS_NewDTDDebug(&pIDTDDebug);
|
1998-06-19 03:26:55 +04:00
|
|
|
if (NS_OK != rval) {
|
|
|
|
fputs("Cannot create parser debugger.\n", stdout);
|
|
|
|
NS_RELEASE(myObserver);
|
|
|
|
return -1;
|
|
|
|
}
|
1998-06-25 05:42:50 +04:00
|
|
|
pIDTDDebug->SetVerificationDirectory(verify_dir);
|
1998-07-30 03:43:20 +04:00
|
|
|
*/
|
1998-06-19 03:26:55 +04:00
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
for (;;) {
|
1998-05-15 01:47:33 +04:00
|
|
|
PRInt32 n = g_workList->Count();
|
1998-05-14 05:08:06 +04:00
|
|
|
if (0 == n) {
|
|
|
|
break;
|
|
|
|
}
|
1998-05-15 01:47:33 +04:00
|
|
|
nsString* urlName = (nsString*) g_workList->ElementAt(n - 1);
|
|
|
|
g_workList->RemoveElementAt(n - 1);
|
1998-05-14 05:08:06 +04:00
|
|
|
|
|
|
|
// Create url
|
1999-06-23 07:29:44 +04:00
|
|
|
nsIURI* url;
|
1999-06-18 21:34:08 +04:00
|
|
|
nsresult rv;
|
2001-07-25 11:54:28 +04:00
|
|
|
nsCOMPtr<nsIIOService> service(do_GetService(kIOServiceCID, &rv));
|
1999-06-18 21:34:08 +04:00
|
|
|
if (NS_FAILED(rv)) return rv;
|
|
|
|
|
|
|
|
nsIURI *uri = nsnull;
|
1999-07-07 18:38:39 +04:00
|
|
|
char *uriStr = urlName->ToNewCString();
|
|
|
|
if (!uriStr) return NS_ERROR_OUT_OF_MEMORY;
|
1999-06-18 21:34:08 +04:00
|
|
|
rv = service->NewURI(uriStr, nsnull, &uri);
|
1999-07-07 18:38:39 +04:00
|
|
|
nsCRT::free(uriStr);
|
1999-06-18 21:34:08 +04:00
|
|
|
if (NS_FAILED(rv)) return rv;
|
|
|
|
|
2000-01-11 23:49:15 +03:00
|
|
|
rv = uri->QueryInterface(NS_GET_IID(nsIURI), (void**)&url);
|
1999-06-18 21:34:08 +04:00
|
|
|
NS_RELEASE(uri);
|
1998-05-14 05:08:06 +04:00
|
|
|
if (NS_OK != rv) {
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("invalid URL: '");
|
1998-05-14 05:08:06 +04:00
|
|
|
fputs(*urlName, stdout);
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("'\n");
|
1998-06-19 03:26:55 +04:00
|
|
|
NS_RELEASE(myObserver);
|
1998-05-14 05:08:06 +04:00
|
|
|
return -1;
|
|
|
|
}
|
1998-06-12 05:35:36 +04:00
|
|
|
|
|
|
|
char str_num[25];
|
|
|
|
sprintf (str_num,"%d",iCount++);
|
|
|
|
fputs ("Robot: parsing(",stdout);
|
|
|
|
fputs (str_num,stdout);
|
|
|
|
fputs (") ",stdout);
|
|
|
|
fputs (*urlName,stdout);
|
|
|
|
fputs ("...",stdout);
|
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
delete urlName;
|
|
|
|
|
|
|
|
nsIParser* parser;
|
1998-07-31 02:42:27 +04:00
|
|
|
|
|
|
|
static NS_DEFINE_IID(kCParserIID, NS_IPARSER_IID);
|
2001-03-11 00:02:12 +03:00
|
|
|
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
|
1998-07-31 02:42:27 +04:00
|
|
|
|
1999-03-09 12:44:27 +03:00
|
|
|
rv = nsComponentManager::CreateInstance(kCParserCID,
|
1998-07-31 02:42:27 +04:00
|
|
|
nsnull,
|
|
|
|
kCParserIID,
|
|
|
|
(void **)&parser);
|
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
if (NS_OK != rv) {
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("can't make parser\n");
|
1998-06-19 03:26:55 +04:00
|
|
|
NS_RELEASE(myObserver);
|
1998-05-14 05:08:06 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
nsIRobotSink* sink;
|
|
|
|
rv = NS_NewRobotSink(&sink);
|
|
|
|
if (NS_OK != rv) {
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("can't make parser\n");
|
1998-06-19 03:26:55 +04:00
|
|
|
NS_RELEASE(myObserver);
|
1998-05-14 05:08:06 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
sink->Init(url);
|
|
|
|
sink->AddObserver(myObserver);
|
|
|
|
|
|
|
|
parser->SetContentSink(sink);
|
1998-11-26 05:55:59 +03:00
|
|
|
g_bReadyForNextUrl = PR_FALSE;
|
1998-06-25 05:42:50 +04:00
|
|
|
|
1999-10-05 08:04:16 +04:00
|
|
|
parser->Parse(url, nsnull,PR_TRUE);/* XXX hook up stream listener here! */
|
1998-05-28 04:21:34 +04:00
|
|
|
while (!g_bReadyForNextUrl) {
|
1998-12-16 08:40:20 +03:00
|
|
|
if (yieldProc != NULL) {
|
1999-06-23 07:29:44 +04:00
|
|
|
char* spec;
|
|
|
|
(void)url->GetSpec(&spec);
|
|
|
|
(*yieldProc)(spec);
|
|
|
|
nsCRT::free(spec);
|
1998-12-16 08:40:20 +03:00
|
|
|
}
|
1998-05-28 04:21:34 +04:00
|
|
|
}
|
1998-06-04 05:19:25 +04:00
|
|
|
g_bReadyForNextUrl = PR_FALSE;
|
2000-03-29 10:13:07 +04:00
|
|
|
if (docShell) {
|
2001-05-05 00:15:38 +04:00
|
|
|
nsCOMPtr<nsIWebProgress> progress(do_GetInterface(docShell, &rv));
|
|
|
|
if (NS_FAILED(rv)) return rv;
|
|
|
|
|
|
|
|
(void) progress->AddProgressListener(pl);
|
1999-10-05 08:04:16 +04:00
|
|
|
|
1999-06-23 07:29:44 +04:00
|
|
|
char* spec;
|
|
|
|
(void)url->GetSpec(&spec);
|
2000-04-03 12:04:52 +04:00
|
|
|
nsAutoString theSpec; theSpec.AssignWithConversion(spec);
|
1999-06-23 07:29:44 +04:00
|
|
|
nsCRT::free(spec);
|
2000-03-29 10:13:07 +04:00
|
|
|
nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(docShell));
|
2001-06-30 15:02:25 +04:00
|
|
|
webNav->LoadURI(theSpec.get(), nsIWebNavigation::LOAD_FLAGS_NONE);/* XXX hook up stream listener here! */
|
1998-06-04 05:19:25 +04:00
|
|
|
while (!g_bReadyForNextUrl) {
|
1998-12-16 08:40:20 +03:00
|
|
|
if (yieldProc != NULL) {
|
1999-06-23 07:29:44 +04:00
|
|
|
(void)url->GetSpec(&spec);
|
|
|
|
(*yieldProc)(spec);
|
|
|
|
nsCRT::free(spec);
|
1998-12-16 08:40:20 +03:00
|
|
|
}
|
1998-06-04 05:19:25 +04:00
|
|
|
}
|
|
|
|
}
|
1998-05-28 04:21:34 +04:00
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
NS_RELEASE(sink);
|
|
|
|
NS_RELEASE(parser);
|
|
|
|
NS_RELEASE(url);
|
|
|
|
}
|
|
|
|
|
1998-06-12 05:35:36 +04:00
|
|
|
fputs ("Robot completed.\n", stdout);
|
|
|
|
|
1998-06-04 05:19:25 +04:00
|
|
|
NS_RELEASE(pl);
|
1998-05-14 05:08:06 +04:00
|
|
|
NS_RELEASE(myObserver);
|
|
|
|
|
1998-07-30 03:43:20 +04:00
|
|
|
// pIDTDDebug->DumpVectorRecord();
|
|
|
|
//NS_RELEASE(pIDTDDebug);
|
1998-06-12 05:35:36 +04:00
|
|
|
|
1998-05-14 05:08:06 +04:00
|
|
|
return 0;
|
|
|
|
}
|