/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL. You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All Rights
 * Reserved.
 */

/*** crawler.h ****************************************************/

/* description: html crawler */


/********************************************************************

The crawler scans html pages and the links in those pages to a specified
depth in a breadth-first manner, optionally caching them in an external cache.
Items are crawled sequentially - an item must finish being crawled or cached
before the next item is crawled. Multiple instances of the crawler may be
running at the same time.

Depth = 1 means that only the initial page, and any images and resources it
contains, are cached.

Depth = n means that the crawler will crawl n levels deep. Assuming the
maximum cache size is sufficient, the crawler will cache images and resources
for each page it encounters. Normally, resources are cached after all the
pages for a given level have been processed, but some resources are
considered "required", meaning they are cached immediately after the page
containing them has been processed. An example of a "required" resource is a
stylesheet.

The crawler obeys the robots.txt directives on a site, which may allow or
deny access to specific urls or directories. By convention, the robots.txt
file lives at the top level of a site.

The types of links that are crawled are determined in pagescan.c.
The parsing code is in htmparse.c.
The robots.txt parser is in robotxt.c.

$Revision: 1.1 $
$Date: 1998/04/30 20:53:21 $

*********************************************************************/

#ifndef crawler_h___
#define crawler_h___

#include "ntypes.h"  /* for MWContext */
#include "prtypes.h" /* for PRBool */
#include "net.h"     /* for ExtCacheDBInfo, URL_Struct */

/* Error codes */
typedef PRUint16 CRAWL_Error;

#define CRAWL_CACHE_FULL    ((CRAWL_Error)0x0001)
#define CRAWL_NO_MEMORY     ((CRAWL_Error)0x0002)
#define CRAWL_SERVER_ERR    ((CRAWL_Error)0x0004)
#define CRAWL_INTERRUPTED   ((CRAWL_Error)0x0008)

/* These error codes indicate if and how the cache has been updated, and are
   only set if CRAWL_MakeCrawler was called with manageCache set to true. Note
   that replaced links may not have been reported as such if the server does
   not provide a last modified date. */
#define CRAWL_NEW_LINK      ((CRAWL_Error)0x0010)
#define CRAWL_REPLACED_LINK ((CRAWL_Error)0x0020)
#define CRAWL_REMOVED_LINK  ((CRAWL_Error)0x0040)

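/* The codes above have power-of-two values, which suggests a single
   CRAWL_Error may carry several of them as combinable bit flags; treating
   them that way is an assumption of this sketch, not a documented guarantee.
   Illustrative only (crawler is a hypothetical variable, and CRAWL_GetError
   is declared further below): */
#if 0
CRAWL_Error err = CRAWL_GetError(crawler);
if (err & CRAWL_CACHE_FULL)
    ; /* the external cache filled up before the crawl completed */
if (err & (CRAWL_NEW_LINK | CRAWL_REPLACED_LINK | CRAWL_REMOVED_LINK))
    ; /* the managed cache contents changed during this crawl */
#endif
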
/* Most of the APIs require a reference to a CRAWL_Crawler, which is created
   by CRAWL_MakeCrawler. */
typedef struct _CRAWL_CrawlerStruct *CRAWL_Crawler;

/*
 * Typedef for a callback executed when an item has been processed.
 */
typedef void
(PR_CALLBACK *CRAWL_PostProcessItemFn)(CRAWL_Crawler crawler, URL_Struct *url_s, PRBool isCached, void *data);

/*
 * Typedef for a callback executed when the crawler is done.
 */
typedef void
(PR_CALLBACK *CRAWL_ExitFn)(CRAWL_Crawler crawler, void *data);

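/* A minimal sketch of implementing the two callbacks above. The function
   names are this example's own choices, and reading url_s->address assumes
   the usual netlib URL_Struct field. Illustrative only: */
#if 0
#include <stdio.h>

static void PR_CALLBACK
my_post_process_item(CRAWL_Crawler crawler, URL_Struct *url_s, PRBool isCached, void *data)
{
    /* called once per crawled item; data is the postProcessItemData that
       was passed to CRAWL_MakeCrawler */
    printf("crawled %s (cached: %d)\n", url_s->address, (int)isCached);
}

static void PR_CALLBACK
my_exit(CRAWL_Crawler crawler, void *data)
{
    /* called when the crawl finishes or terminates prematurely; data is
       the exitData that was passed to CRAWL_MakeCrawler */
    printf("crawl done, error flags: 0x%x\n", (unsigned)CRAWL_GetError(crawler));
}
#endif
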
/****************************************************************************************/
/* public API */
/****************************************************************************************/

NSPR_BEGIN_EXTERN_C

/*
   Creates a crawler which may be used for one crawling request. Subsequent
   requests to crawl urls should use a separate crawler instance. Returns NULL
   if not enough memory is available, or the depth is less than 1.

   Parameters:
   context - needed by netlib (the crawler does not check this parameter)
   siteName - url of the site
   depth - how many levels deep to crawl; must be at least 1 (see the
     overview above)
   stayInSite - whether to restrict crawling to the site named.
   manageCache - whether to maintain a local file describing the cache
     contents. If true, the crawler uses the file to remove dangling links
     from the cache the next time it is invoked with the same cache. This is
     not guaranteed to work correctly if another crawler instance uses the
     same cache simultaneously.
   cache - the external cache. This may be NULL if the crawled items do not
     need to be put in an external cache.
   postProcessItemFn - a function which is called after each item has been
     handled by netlib. This may be NULL.
   postProcessItemData - this data is supplied as a parameter to the
     postProcessItemFn and is opaque to the crawler. This may be NULL.
   exitFn - a function which is called when the crawler is done or has
     terminated prematurely (because the cache is full, or no memory is
     available). This may be NULL.
   exitData - this data is supplied as a parameter to the exitFn and is
     opaque to the crawler. This may be NULL.
*/
PR_EXTERN(CRAWL_Crawler)
CRAWL_MakeCrawler(MWContext *context,
                  char *siteName,
                  uint8 depth,
                  PRBool stayInSite,
                  PRBool manageCache,
                  ExtCacheDBInfo *cache,
                  CRAWL_PostProcessItemFn postProcessItemFn,
                  void *postProcessItemData,
                  CRAWL_ExitFn exitFn,
                  void *exitData);

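/* A minimal creation sketch, assuming the callbacks sketched earlier and a
   hypothetical MWContext named context; the site url and depth are
   placeholder values. Illustrative only: */
#if 0
CRAWL_Crawler crawler =
    CRAWL_MakeCrawler(context,                   /* MWContext from the caller */
                      "http://www.mozilla.org/",
                      2,                         /* crawl two levels deep */
                      PR_TRUE,                   /* stay within the site */
                      PR_FALSE,                  /* don't manage a cache file */
                      NULL,                      /* no external cache */
                      my_post_process_item, NULL,
                      my_exit, NULL);
if (crawler == NULL)
    ; /* out of memory, or depth < 1 */
#endif
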
/*
   Destroys the crawler and all memory associated with it. The crawler
   instance should not be used after calling this function.
*/
PR_EXTERN(void)
CRAWL_DestroyCrawler(CRAWL_Crawler crawler);

/*
   Starts crawling from the url. If its content type is text/html, links may
   be traversed. This function returns as soon as the first network request
   is issued.
*/
PR_EXTERN(void)
CRAWL_StartCrawler(CRAWL_Crawler crawler, char *url);

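/* Continuing the sketch above: the call returns once the first network
   request is issued, so completion must be observed through the exitFn (or
   by polling CRAWL_IsStopped, declared below). Illustrative only: */
#if 0
CRAWL_StartCrawler(crawler, "http://www.mozilla.org/");
/* control returns here immediately; my_exit runs when crawling is done */
#endif
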
/*
   Stops crawling at the next link. This function returns immediately and
   cannot fail.
*/
PR_EXTERN(void)
CRAWL_StopCrawler(CRAWL_Crawler crawler);

/*
   Returns the crawler error code. This function returns immediately and
   cannot fail.
*/
PR_EXTERN(CRAWL_Error)
CRAWL_GetError(CRAWL_Crawler crawler);

/*
   Returns true if the crawler has stopped, which is the case before and
   after crawling. Returns immediately and cannot fail.
*/
PR_EXTERN(PRBool)
CRAWL_IsStopped(CRAWL_Crawler crawler);

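/* A sketch of inspecting crawler state once it has stopped and then tearing
   the instance down; continues the example above and is illustrative only: */
#if 0
if (CRAWL_IsStopped(crawler)) {
    if (CRAWL_GetError(crawler) & CRAWL_CACHE_FULL)
        ; /* the crawl ended prematurely because the external cache is full */
    CRAWL_DestroyCrawler(crawler); /* this instance must not be reused */
    crawler = NULL;
}
#endif
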
NSPR_END_EXTERN_C

/*
   Stream function for crawling resources. Resources are not parsed, but the
   crawler checks the content length to see if the cache would be exceeded.
*/
PUBLIC NET_StreamClass*
CRAWL_CrawlerResourceConverter(int format_out,
                               void *data_object,
                               URL_Struct *URL_s,
                               MWContext *window_id);

#endif /* crawler_h___ */