/* Mirrored from https://github.com/mozilla/gecko-dev.git */
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL. You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All Rights
 * Reserved.
 */
/*** robotxt.h ****************************************************/
/* description: parses the robots.txt file */
/* - not dependent on the crawler */

/********************************************************************
 See the robots.txt specification at:

 http://info.webcrawler.com/mak/projects/robots/norobots.html (original spec)
 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html

 Note: the original spec says that at least one Disallow field must be present
 in a record. That is what I am following.

 $Revision: 1.1 $
 $Date: 1998/04/30 20:53:33 $

 *********************************************************************/
|
#ifndef robotctl_h___
#define robotctl_h___

#include "prtypes.h"
#include "ntypes.h"
#include "net.h"
/* Result of a robots.txt query for a URL (see CRAWL_GetRobotControl). */
typedef uint8 CRAWL_RobotControlStatus;
#define CRAWL_ROBOT_DISALLOWED ((CRAWL_RobotControlStatus)0x00)
#define CRAWL_ROBOT_ALLOWED ((CRAWL_RobotControlStatus)0x01)
#define CRAWL_ROBOTS_TXT_NOT_QUERIED ((CRAWL_RobotControlStatus)0x02)
/* Opaque per-site handle; the struct definition is private to the
   robots.txt parser implementation. Create with CRAWL_MakeRobotControl,
   release with CRAWL_DestroyRobotControl. */
typedef struct _CRAWL_RobotControlStruct *CRAWL_RobotControl;
/*
|
|
* Typedef for function callback called after robots.txt is read.
|
|
*/
|
|
typedef void
|
|
(PR_CALLBACK *CRAWL_RobotControlStatusFunc)(void *data);
|
|
|
|
/* stream function */
|
|
PUBLIC NET_StreamClass*
|
|
CRAWL_RobotsTxtConverter(int format_out,
|
|
void *data_object,
|
|
URL_Struct *URL_s,
|
|
MWContext *window_id);
|
|
|
|
/****************************************************************************************/
|
|
/* public API */
|
|
/****************************************************************************************/
|
|
|
|
NSPR_BEGIN_EXTERN_C
|
|
|
|
/* Creates a robot control for the site.
|
|
Parameters:
|
|
context - context for libnet
|
|
site - protocol and host portion of url. "/robots.txt" will be appended to this to get the
|
|
location of robots.txt.
|
|
*/
|
|
PR_EXTERN(CRAWL_RobotControl)
|
|
CRAWL_MakeRobotControl(MWContext *context, char *site);
|
|
|
|
/* Destroys a robot control and all memory associated with it (except for the context or the
|
|
opaque data supplied to CRAWL_ReadRobotControlFile)
|
|
*/
|
|
PR_EXTERN(void)
|
|
CRAWL_DestroyRobotControl(CRAWL_RobotControl control);
|
|
|
|
/* Parses the robots.txt at the site specified in the control, and performs a callback when
|
|
it is done. This function returns after issuing a request to netlib.
|
|
Parameters:
|
|
control - the robot control for the site
|
|
func - completion callback
|
|
data - data to provide to the callback which is opaque to the robots.txt parser
|
|
freeData - if true, frees data (previous param) on completion
|
|
*/
|
|
PR_EXTERN(PRBool)
|
|
CRAWL_ReadRobotControlFile(CRAWL_RobotControl control, CRAWL_RobotControlStatusFunc func, void *data, PRBool freeData);
|
|
|
|
/* Returns a status code indicating the robot directive for the url supplied */
|
|
PR_EXTERN(CRAWL_RobotControlStatus)
|
|
CRAWL_GetRobotControl(CRAWL_RobotControl, char *url);
|
|
|
|
NSPR_END_EXTERN_C
|
|
|
|
#endif
|
|
|