180 строки
6.8 KiB
C
180 строки
6.8 KiB
C
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
|
*
|
|
* The contents of this file are subject to the Netscape Public License
|
|
* Version 1.0 (the "NPL"); you may not use this file except in
|
|
* compliance with the NPL. You may obtain a copy of the NPL at
|
|
* http://www.mozilla.org/NPL/
|
|
*
|
|
* Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
* for the specific language governing rights and limitations under the
|
|
* NPL.
|
|
*
|
|
* The Initial Developer of this code under the NPL is Netscape
|
|
* Communications Corporation. Portions created by Netscape are
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
* Reserved.
|
|
*/
|
|
/*** crawler.h ****************************************************/
|
|
/* description: html crawler */
|
|
|
|
|
|
/********************************************************************
|
|
|
|
The crawler scans html pages and the links in those pages to a specified
|
|
depth in a breadth first manner, optionally caching them in an external cache.
|
|
Items are crawled sequentially - an item must finish being crawled or cached
|
|
before the next item is crawled. Multiple instances of the crawler may be running
|
|
at the same time.
|
|
|
|
Depth = 1 means that only the initial page, and any images and resources that
|
|
it contains, are cached.
|
|
|
|
Depth = n means that the crawler will crawl to n levels deep. Assuming the
|
|
maximum cache size is sufficient, the crawler will cache images and resources
|
|
for each page it encounters. Normally, resources are cached after all the pages
|
|
for a given level have been processed, but some resources are considered "required",
|
|
meaning they will be cached immediately after the page containing them has been
|
|
processed. An example of a "required" resource is a stylesheet.
|
|
|
|
The crawler obeys the robots.txt directives on a site, which may allow or deny access
|
|
to specific urls or directories. The robots.txt file is by convention at the top level
|
|
of a site.
|
|
|
|
The type of links that are crawled are determined in pagescan.c.
|
|
The parsing code is in htmparse.c
|
|
The robots.txt parser is in robotxt.c
|
|
|
|
$Revision: 1.1 $
|
|
$Date: 1998-04-30 20:53:21 $
|
|
|
|
*********************************************************************/
|
|
|
|
#ifndef crawler_h___
|
|
#define crawler_h___
|
|
|
|
#include "ntypes.h" /* for MWContext */
|
|
#include "prtypes.h" /* for PRBool */
|
|
#include "net.h" /* for ExtCacheDBInfo, URL_Struct */
|
|
|
|
/* Error codes */
|
|
typedef PRUint16 CRAWL_Error;
|
|
|
|
#define CRAWL_CACHE_FULL ((CRAWL_Error)0x0001)
|
|
#define CRAWL_NO_MEMORY ((CRAWL_Error)0x0002)
|
|
#define CRAWL_SERVER_ERR ((CRAWL_Error)0x0004)
|
|
#define CRAWL_INTERRUPTED ((CRAWL_Error)0x0008)
|
|
|
|
/* these error codes indicate if and how the cache has been updated and are only
|
|
set if CRAWL_MakeCrawler was called with manageCache set to true. Note that replaced
|
|
links may not have been reported as such if the server does not provide a last
|
|
modified date.
|
|
*/
|
|
#define CRAWL_NEW_LINK ((CRAWL_Error)0x0010)
|
|
#define CRAWL_REPLACED_LINK ((CRAWL_Error)0x0020)
|
|
#define CRAWL_REMOVED_LINK ((CRAWL_Error)0x0040)
|
|
|
|
/* Most of the APIs require a reference to CRAWL_Crawler, which is created by CRAWL_MakeCrawler. */
|
|
typedef struct _CRAWL_CrawlerStruct *CRAWL_Crawler;
|
|
|
|
/*
|
|
* Typedef for a callback executed when an item has been processed.
|
|
*/
|
|
typedef void
|
|
(PR_CALLBACK *CRAWL_PostProcessItemFn)(CRAWL_Crawler crawler, URL_Struct *url_s, PRBool isCached, void *data);
|
|
|
|
/*
|
|
* Typedef for a callback executed when the crawler is done.
|
|
*/
|
|
typedef void
|
|
(PR_CALLBACK *CRAWL_ExitFn)(CRAWL_Crawler crawler, void *data);
|
|
|
|
/****************************************************************************************/
|
|
/* public API */
|
|
/****************************************************************************************/
|
|
|
|
NSPR_BEGIN_EXTERN_C
|
|
|
|
/*
|
|
Creates a crawler which may be used for one crawling request. Subsequent requests
|
|
to crawl urls should use a separate crawler instance. Returns NULL if not enough
|
|
memory is available, or the depth is less than 1.
|
|
|
|
Parameters:
|
|
context - needed by netlib (the crawler does not check this parameter)
|
|
siteName - url of the site
|
|
stayInSite - whether to restrict crawling to the site named.
|
|
manageCache - whether to maintain a local file describing the cache contents.
|
|
If true, the crawler uses the file to remove dangling links from the cache
|
|
the next time it is invoked with the same cache. This is not guaranteed to
|
|
work correctly if another crawling instance uses the same cache simultaneously.
|
|
cache - the external cache. This may be NULL if the crawled items do not need
|
|
to be put in an external cache.
|
|
postProcessItemFn - a function which is called after each item has been handled
|
|
by netlib. This may be NULL.
|
|
postProcessItemData - this data is supplied as a parameter to the postProcessItemFn
|
|
and is opaque to the crawler. This may be NULL.
|
|
exitFn - a function which is called when the crawler is done or has terminated
|
|
prematurely (because the cache is full, or no memory is available). This may be NULL.
|
|
exitData - this data is supplied as a parameter to the exitFn and is opaque to
|
|
the crawler. This may be NULL.
|
|
*/
|
|
PR_EXTERN(CRAWL_Crawler)
|
|
CRAWL_MakeCrawler(MWContext *context,
|
|
char *siteName,
|
|
uint8 depth,
|
|
PRBool stayInSite,
|
|
PRBool manageCache,
|
|
ExtCacheDBInfo *cache,
|
|
CRAWL_PostProcessItemFn postProcessItemFn,
|
|
void *postProcessItemData,
|
|
CRAWL_ExitFn exitFn,
|
|
void *exitData);
|
|
|
|
/*
|
|
Destroys the crawler and all memory associated with it. The crawler instance should not be
|
|
used after calling this function.
|
|
*/
|
|
PR_EXTERN(void)
|
|
CRAWL_DestroyCrawler(CRAWL_Crawler crawler);
|
|
|
|
/*
|
|
Starts crawling from the url. If its content type is text/html, links may be traversed. This function
|
|
returns as soon as the first network request is issued.
|
|
*/
|
|
PR_EXTERN(void)
|
|
CRAWL_StartCrawler(CRAWL_Crawler crawler, char *url);
|
|
|
|
/*
|
|
Stops crawling at the next link. This function returns immediately and cannot fail.
|
|
*/
|
|
PR_EXTERN(void)
|
|
CRAWL_StopCrawler(CRAWL_Crawler crawler);
|
|
|
|
/*
|
|
Returns the crawler error code. This function returns immediately and cannot fail.
|
|
*/
|
|
PR_EXTERN(CRAWL_Error)
|
|
CRAWL_GetError(CRAWL_Crawler crawler);
|
|
|
|
/*
|
|
Returns true if the crawler has stopped, which is the case before and after crawling. Returns
|
|
immediately and cannot fail.
|
|
*/
|
|
PR_EXTERN(PRBool)
|
|
CRAWL_IsStopped(CRAWL_Crawler crawler);
|
|
|
|
NSPR_END_EXTERN_C
|
|
|
|
/*
|
|
Stream function for crawling resources. Resources are not parsed, but the crawler checks the
|
|
content length to see if the cache would be exceeded.
|
|
*/
|
|
PUBLIC NET_StreamClass*
|
|
CRAWL_CrawlerResourceConverter(int format_out,
|
|
void *data_object,
|
|
URL_Struct *URL_s,
|
|
MWContext *window_id);
|
|
|
|
#endif /* crawler_h___ */
|