2005-01-19 20:16:09 +03:00
|
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
|
|
* Version: MPL 1.1
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the Mozilla Public License Version
|
|
|
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
|
|
* the License. You may obtain a copy of the License at
|
|
|
|
* http://www.mozilla.org/MPL/
|
|
|
|
*
|
|
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
|
|
* for the specific language governing rights and limitations under the
|
|
|
|
* License.
|
|
|
|
*
|
|
|
|
* The Original Code is SniffURI.
|
|
|
|
*
|
|
|
|
* The Initial Developer of the Original Code is
|
|
|
|
* Erik van der Poel <erik@vanderpoel.org>.
|
|
|
|
* Portions created by the Initial Developer are Copyright (C) 1998-2005
|
|
|
|
* the Initial Developer. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* Contributor(s):
|
|
|
|
*
|
|
|
|
* ***** END LICENSE BLOCK ***** */
|
|
|
|
|
|
|
|
#include "all.h"
|
|
|
|
|
|
|
|
#define OUTPUT_DIRECTORY "test/robot/"
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
/*
 * One row of a statistics table.  An Entry is created by addEntry() the
 * first time a key is seen and released by freeEntry().
 */
typedef struct Entry
{
	/* number of times the key has been passed to addEntry() */
	int count;
	/* heap copy of the robot's viewURL at the time the entry was created */
	unsigned char *viewURL;
	/* heap copy of the URL given to addEntry() when the entry was created */
	unsigned char *url;
} Entry;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
typedef struct Robot
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
int slot;
|
|
|
|
int count;
|
|
|
|
View *view;
|
|
|
|
URL *url;
|
|
|
|
char viewFile[1024];
|
|
|
|
int viewFileAdded;
|
|
|
|
char viewURL[1024];
|
2005-01-21 00:09:40 +03:00
|
|
|
} Robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
/*
 * Most recent status report of one worker slot, as recorded by
 * robotStatus() and displayed by readClientRequest().
 */
typedef struct StatusEntry
{
	/* wall-clock time at which the report was recorded */
	time_t time;
	/* status text; stored by pointer, not copied */
	char *message;
	/* source file name supplied by the reporter */
	char *file;
	/* source line number supplied by the reporter */
	int line;
} StatusEntry;
|
|
|
|
|
|
|
|
/*
 * Accumulated timing statistics for one kind of task.  robotTime() adds
 * samples in microseconds; printTimes() reports them in seconds.
 */
typedef struct TimeEntry
{
	/* label shown in the "Task" column */
	char *task;
	/* number of samples recorded */
	int count;
	/* sum of all samples */
	double total;
	/* smallest sample seen so far */
	double min;
	/* largest sample seen so far */
	double max;
} TimeEntry;
|
|
|
|
|
|
|
|
/* Callback invoked with a descriptor that select() reported as readable. */
typedef void (*Handler)(int fd);

/*
 * Book-keeping for one descriptor watched by the status thread; instances
 * are created by addFD() and destroyed by removeFD().
 */
typedef struct FD
{
	/* function to call when the descriptor is readable */
	Handler handler;
	/* stdio stream wrapped around the descriptor, or NULL if none */
	FILE *file;
} FD;
|
|
|
|
|
|
|
|
#define numberOfSlots 64
|
2005-01-19 20:16:09 +03:00
|
|
|
static Thread slots[numberOfSlots];
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static Thread statusThread;
|
2000-02-01 21:24:20 +03:00
|
|
|
static StatusEntry statusEntries[numberOfSlots];
|
|
|
|
static int sortedStatusEntries[numberOfSlots];
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static unsigned short mainPort = 40404;
|
2000-02-01 21:24:20 +03:00
|
|
|
static int maxFD = -1;
|
|
|
|
static FD **table = NULL;
|
|
|
|
static fd_set fdSet;
|
|
|
|
|
|
|
|
static TimeEntry times[] =
|
|
|
|
{
|
|
|
|
{ "connect success", 0, 0.0, DBL_MAX, DBL_MIN },
|
|
|
|
{ "connect failure", 0, 0.0, DBL_MAX, DBL_MIN },
|
|
|
|
{ "gethostbyname_r success", 0, 0.0, DBL_MAX, DBL_MIN },
|
|
|
|
{ "gethostbyname_r failure", 0, 0.0, DBL_MAX, DBL_MIN },
|
|
|
|
{ "readStream", 0, 0.0, DBL_MAX, DBL_MIN },
|
|
|
|
{ "total", 0, 0.0, DBL_MAX, DBL_MIN }
|
|
|
|
};
|
|
|
|
|
|
|
|
#if 0
/* Not compiled: NULL-terminated list of URLs; the sample entries are disabled. */
static unsigned char *limitURLs[] =
{
	/*
	"http://somehost/",
	"http://somehost.somedomain.com/",
	*/
	NULL
};
#endif
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static char *outFile = OUTPUT_DIRECTORY "index.html";
|
|
|
|
|
2000-02-01 21:24:20 +03:00
|
|
|
static char *limitDomains[16];
|
|
|
|
static int limitDomainsIndex = 0;
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static int count = 0;
|
|
|
|
#define LIMIT 50
|
|
|
|
#define INTERVAL ((LIMIT / 100) ? (LIMIT / 100) : 5)
|
|
|
|
|
|
|
|
static URL *currURL = NULL;
|
2000-02-01 21:24:20 +03:00
|
|
|
static URL *lastURL = NULL;
|
|
|
|
static URL *urls = NULL;
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static char *waiting = "waiting";
|
|
|
|
|
2000-02-01 21:24:20 +03:00
|
|
|
#ifdef ROBOT_LOG_ATTRIBUTES
|
|
|
|
static HashTable *attributeTable = NULL;
|
|
|
|
#endif
|
|
|
|
static HashTable *contentTypeTable = NULL;
|
|
|
|
static HashTable *httpCharsetTable = NULL;
|
|
|
|
static HashTable *httpHeaderTable = NULL;
|
|
|
|
static HashTable *metaCharsetTable = NULL;
|
|
|
|
static HashTable *schemeTable = NULL;
|
|
|
|
#ifdef ROBOT_LOG_TAGS
|
|
|
|
static HashTable *tagTable = NULL;
|
|
|
|
#endif
|
|
|
|
static HashTable *urlTable = NULL;
|
|
|
|
|
|
|
|
/* lookup table built from badTags[] by initKnownBadTags() */
static HashTable *knownBadTags = NULL;

/* NULL-terminated list of tag names to load into knownBadTags; currently empty */
static unsigned char *badTags[] =
{
	NULL
};

/* stream of the statistics page while printStats() is writing it */
static FILE *statsOut = NULL;

/* root URL given with "-s" */
static char *firstURL = NULL;

/* ctime() text of the program start, copied in main() */
static unsigned char *startTime = NULL;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
|
|
|
robotStatus(App *app, char *message, char *file, int line)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
StatusEntry *entry;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
if (!app)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
|
|
|
entry = &statusEntries[robot->slot];
|
2000-02-01 21:24:20 +03:00
|
|
|
time(&entry->time);
|
|
|
|
entry->message = message;
|
|
|
|
entry->file = file;
|
|
|
|
entry->line = line;
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
|
|
|
robotTime(App *app, int task, struct timeval *before)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
struct timeval after;
|
|
|
|
double span;
|
|
|
|
|
|
|
|
gettimeofday(&after, NULL);
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
threadMutexLock();
|
2000-02-01 21:24:20 +03:00
|
|
|
span = (((after.tv_sec - before->tv_sec) * 1000000.0) +
|
|
|
|
after.tv_usec - before->tv_usec);
|
|
|
|
times[task].total += span;
|
|
|
|
if (span < times[task].min)
|
|
|
|
{
|
|
|
|
times[task].min = span;
|
|
|
|
}
|
|
|
|
if (span > times[task].max)
|
|
|
|
{
|
|
|
|
times[task].max = span;
|
|
|
|
}
|
|
|
|
times[task].count++;
|
2005-01-19 20:16:09 +03:00
|
|
|
threadMutexUnlock();
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(HashTable *table, Robot *robot, unsigned char *url, unsigned char *str)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
Entry *entry;
|
|
|
|
HashEntry *hashEntry;
|
|
|
|
|
|
|
|
hashEntry = hashLookup(table, str);
|
|
|
|
if (hashEntry)
|
|
|
|
{
|
|
|
|
((Entry *) hashEntry->value)->count++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
entry = calloc(sizeof(Entry), 1);
|
|
|
|
if (!entry)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "cannot calloc Entry\n");
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
entry->count = 1;
|
2005-01-21 00:09:40 +03:00
|
|
|
entry->viewURL = copyString((unsigned char *) robot->viewURL);
|
|
|
|
robot->viewFileAdded = 1;
|
2000-02-01 21:24:20 +03:00
|
|
|
entry->url = copyString(url);
|
|
|
|
hashAdd(table, copyString(str), entry);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
freeEntry(unsigned char *str, void *e)
|
|
|
|
{
|
|
|
|
free(str);
|
|
|
|
if (e)
|
|
|
|
{
|
|
|
|
free(((Entry *) e)->url);
|
|
|
|
free(((Entry *) e)->viewURL);
|
|
|
|
free(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
robotScheme(Robot *robot, unsigned char *url, unsigned char *scheme)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(schemeTable, robot, url, scheme);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
addURLFunc(App *app, URL *url)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
if (!url->scheme)
|
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robotScheme(robot, url->url, url->scheme);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
if (strcmp((char *) url->scheme, "http"))
|
|
|
|
{
|
|
|
|
urlFree(url);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
threadMutexLock();
|
2000-02-01 21:24:20 +03:00
|
|
|
lastURL->next = url;
|
|
|
|
lastURL = url;
|
2005-01-19 20:16:09 +03:00
|
|
|
if (!currURL)
|
|
|
|
{
|
|
|
|
currURL = url;
|
|
|
|
}
|
|
|
|
threadCondSignal();
|
|
|
|
threadMutexUnlock();
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTTP(App *app, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTTP(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTTPBody(App *app, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTTPHeaderName(App *app, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
unsigned char *name;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
name = bufCopyLower(buf);
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(httpHeaderTable, robot, robot->url->url, name);
|
2000-02-01 21:24:20 +03:00
|
|
|
free(name);
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTTPHeaderName(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTTPHeaderValue(App *app, Buf *buf, unsigned char *url)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTTPHeaderValue(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTML(App *app, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTML(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTMLText(App *app, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
unsigned char *p;
|
|
|
|
unsigned char *str;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTMLText(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
str = bufCopy(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
p = str;
|
|
|
|
while (*p)
|
|
|
|
{
|
|
|
|
if
|
|
|
|
(
|
|
|
|
(p[0] == '&' ) &&
|
|
|
|
(p[1] == '#' ) &&
|
|
|
|
(p[2] == '1' ) &&
|
|
|
|
(p[3] == '4' ) &&
|
|
|
|
(p[4] != '\0') &&
|
|
|
|
(p[5] == ';' )
|
|
|
|
)
|
|
|
|
{
|
|
|
|
if (p[4] == '7')
|
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
fprintf(stderr, "147: %s\n", robot->url->url);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
else if (p[4] == '8')
|
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
fprintf(stderr, "148: %s\n", robot->url->url);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
free(str);
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTMLTag(App *app, HTML *html, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
HashEntry *tagEntry;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
if (html->tagIsKnown)
|
|
|
|
{
|
|
|
|
#ifdef ROBOT_LOG_TAGS
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(tagTable, robot, robot->url->url, html->tag);
|
2000-02-01 21:24:20 +03:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
tagEntry = hashLookup(knownBadTags, html->tag);
|
|
|
|
if (!tagEntry)
|
|
|
|
{
|
|
|
|
/* XXX
|
|
|
|
printf("\t\"%s\",\n", html->tag);
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTMLTag(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTMLAttributeName(App *app, HTML *html, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
if (html->tagIsKnown)
|
|
|
|
{
|
|
|
|
#ifdef ROBOT_LOG_ATTRIBUTES
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(attributeTable, robot, robot->url->url,
|
2000-02-01 21:24:20 +03:00
|
|
|
html->currentAttribute->name);
|
|
|
|
#endif
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
viewHTMLAttributeName(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
2005-01-25 11:22:29 +03:00
|
|
|
robotHTMLAttributeValue(App *app, HTML *html, Buf *buf)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-02-03 04:06:24 +03:00
|
|
|
viewHTMLAttributeValue(app, buf);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-02-03 04:06:24 +03:00
|
|
|
static void
|
|
|
|
robotHTMLDeclaration(App *app, Buf *buf)
|
|
|
|
{
|
|
|
|
viewHTMLDeclaration(app, buf);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-02-03 04:06:24 +03:00
|
|
|
static void
|
|
|
|
robotHTMLProcessingInstruction(App *app, Buf *buf)
|
|
|
|
{
|
|
|
|
viewHTMLProcessingInstruction(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
|
|
|
robotContentType(App *app, unsigned char *contentType)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(contentTypeTable, robot, robot->url->url, contentType);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
metaHandler(App *app, HTML *html)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
HTMLAttribute *attr;
|
|
|
|
unsigned char *charset;
|
|
|
|
ContentType *contentType;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
attr = html->attributes;
|
|
|
|
while (attr)
|
|
|
|
{
|
|
|
|
if
|
|
|
|
(
|
|
|
|
(!strcmp((char *) attr->name, "http-equiv")) &&
|
|
|
|
(attr->value) &&
|
|
|
|
(!strcasecmp((char *) attr->value, "content-type"))
|
|
|
|
)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
attr = attr->next;
|
|
|
|
}
|
|
|
|
if (attr)
|
|
|
|
{
|
|
|
|
contentType =
|
|
|
|
mimeParseContentType(html->currentAttribute->value);
|
|
|
|
charset = mimeGetContentTypeParameter(contentType, "charset");
|
|
|
|
mimeFreeContentType(contentType);
|
|
|
|
if (charset)
|
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(metaCharsetTable, robot, robot->url->url,
|
2000-02-01 21:24:20 +03:00
|
|
|
lowerCase(charset));
|
|
|
|
free(charset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
|
|
|
tagHandler(App *app, HTML *html)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
static void
|
|
|
|
robotHTTPCharSet(App *app, unsigned char *charset)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
addEntry(httpCharsetTable, robot, robot->url->url, charset);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
printEntry(HashEntry *hashEntry)
|
|
|
|
{
|
|
|
|
Entry *entry;
|
|
|
|
unsigned char *key;
|
|
|
|
unsigned char *url;
|
|
|
|
|
|
|
|
entry = hashEntry->value;
|
|
|
|
key = toHTML(hashEntry->key);
|
|
|
|
url = toHTML(entry->url);
|
|
|
|
fprintf
|
|
|
|
(
|
|
|
|
statsOut,
|
|
|
|
"<tr><td>%s</td><td align=right>%d</td>"
|
|
|
|
"<td><a href=%s>View Source</a></td>"
|
|
|
|
"<td><a href=%s>%s</a></td></tr>\n",
|
|
|
|
key,
|
|
|
|
entry->count,
|
|
|
|
entry->viewURL,
|
|
|
|
url,
|
|
|
|
url
|
|
|
|
);
|
|
|
|
free(key);
|
|
|
|
free(url);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
printTable(HashTable *table, char *column1)
|
|
|
|
{
|
|
|
|
fprintf(statsOut, "<table>\n");
|
|
|
|
fprintf
|
|
|
|
(
|
|
|
|
statsOut,
|
|
|
|
"<tr bgcolor=#cccccc><td align=center>%s</td><td>Count</td>"
|
|
|
|
"<td align=center>View Source</td>"
|
|
|
|
"<td align=center>Example URL</td></tr>\n",
|
|
|
|
column1
|
|
|
|
);
|
|
|
|
hashEnumerate(table, printEntry);
|
|
|
|
fprintf(statsOut, "</table>\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
printTimes(FILE *file)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
fprintf(file, "<table>\n");
|
|
|
|
|
|
|
|
fprintf(file, "<tr bgcolor=#cccccc>");
|
|
|
|
fprintf(file, "<td>Task</td>");
|
|
|
|
fprintf(file, "<td>Count</td>");
|
|
|
|
fprintf(file, "<td>Average</td>");
|
|
|
|
fprintf(file, "<td>Min</td>");
|
|
|
|
fprintf(file, "<td>Max</td>");
|
|
|
|
fprintf(file, "</tr>");
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
for (i = 0; i < appTimeMax; i++)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
TimeEntry *entry;
|
|
|
|
|
|
|
|
entry = ×[i];
|
|
|
|
fprintf(file, "<tr>");
|
|
|
|
fprintf(file, "<td>%s</td>", entry->task);
|
|
|
|
fprintf(file, "<td align=right>%d</td>", entry->count);
|
|
|
|
if (entry->count)
|
|
|
|
{
|
|
|
|
fprintf(file, "<td align=right>%f</td>",
|
|
|
|
(entry->total / entry->count) / 1000000);
|
|
|
|
fprintf(file, "<td align=right>%f</td>",
|
|
|
|
entry->min / 1000000);
|
|
|
|
fprintf(file, "<td align=right>%f</td>",
|
|
|
|
entry->max / 1000000);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
fprintf(file, "<td> </td>");
|
|
|
|
fprintf(file, "<td> </td>");
|
|
|
|
fprintf(file, "<td> </td>");
|
|
|
|
}
|
|
|
|
fprintf(file, "</tr>");
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(file, "</table>\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-19 20:16:09 +03:00
|
|
|
printStats(void)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
char backup[1024];
|
|
|
|
char **limit;
|
|
|
|
time_t theTime;
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
sprintf(backup, "%s.bak", outFile);
|
|
|
|
rename(outFile, backup);
|
|
|
|
statsOut = fopen(outFile, "w");
|
2000-02-01 21:24:20 +03:00
|
|
|
fprintf(statsOut, "<html><head><title>Stats</title></head><body>\n");
|
|
|
|
fprintf(statsOut, "<table bgcolor=#cccccc>\n");
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>Start Time</td><td align=right>%s</td></tr>\n",
|
|
|
|
startTime);
|
|
|
|
time(&theTime);
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>Time of This File</td><td align=right>%s</td></tr>\n",
|
|
|
|
ctime(&theTime));
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>Root URL</td><td align=right>%s</td></tr>\n",
|
|
|
|
firstURL);
|
|
|
|
fprintf(statsOut, "<tr><td>Domain Limits</td><td align=right>");
|
|
|
|
limit = limitDomains;
|
|
|
|
while (*limit)
|
|
|
|
{
|
|
|
|
fprintf(statsOut, "%s ", *limit);
|
|
|
|
limit++;
|
|
|
|
}
|
|
|
|
fprintf(statsOut, "</td></tr>\n");
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>URLs Attempted</td><td align=right>%d</td></tr>\n",
|
|
|
|
count);
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>DNS Successes</td><td align=right>%d</td></tr>\n",
|
|
|
|
netGetDNSCount());
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>Connect Successes</td><td align=right>%d</td></tr>\n",
|
|
|
|
netGetConnectCount());
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>Non-empty HTTP Responses</td>"
|
|
|
|
"<td align=right>%d</td></tr>\n",
|
|
|
|
httpGetNonEmptyHTTPResponseCount());
|
|
|
|
fprintf(statsOut,
|
|
|
|
"<tr><td>HTTP/1.0 or Greater</td>"
|
|
|
|
"<td align=right>%d</td></tr>\n",
|
|
|
|
httpGetHTTP10OrGreaterCount());
|
|
|
|
fprintf(statsOut, "</table>\n");
|
|
|
|
printTimes(statsOut);
|
|
|
|
printTable(schemeTable, "URL Scheme");
|
|
|
|
printTable(httpHeaderTable, "HTTP Header");
|
|
|
|
printTable(contentTypeTable, "Content-Type");
|
|
|
|
printTable(httpCharsetTable, "HTTP charset");
|
|
|
|
printTable(metaCharsetTable, "META charset");
|
|
|
|
#ifdef ROBOT_LOG_TAGS
|
|
|
|
printTable(tagTable, "HTML Tag");
|
|
|
|
#endif
|
|
|
|
#ifdef ROBOT_LOG_ATTRIBUTES
|
|
|
|
printTable(attributeTable, "HTML Attribute");
|
|
|
|
#endif
|
|
|
|
fprintf(statsOut, "</body></html>\n");
|
|
|
|
fclose(statsOut);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
openViewFile(App *app)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
|
|
|
|
|
|
|
robot = app->data;
|
|
|
|
sprintf(robot->viewURL, "%010d.html", robot->count);
|
|
|
|
sprintf(robot->viewFile, "%s%s", OUTPUT_DIRECTORY, robot->viewURL);
|
2005-01-19 20:16:09 +03:00
|
|
|
/*
|
2005-01-21 00:09:40 +03:00
|
|
|
sprintf(robot->viewFile, "/dev/null");
|
2005-01-19 20:16:09 +03:00
|
|
|
*/
|
2005-01-21 00:09:40 +03:00
|
|
|
robot->viewFileAdded = 0;
|
|
|
|
app->view.out = fopen(robot->viewFile, "w");
|
|
|
|
if (!app->view.out)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
fprintf(stderr, "cannot open %s for writing: %s\n",
|
2005-01-21 00:09:40 +03:00
|
|
|
robot->viewFile, strerror(errno));
|
2000-02-01 21:24:20 +03:00
|
|
|
exit(0);
|
|
|
|
}
|
2005-01-22 02:50:42 +03:00
|
|
|
fprintf(app->view.out,
|
|
|
|
"<html><head><title>View</title></head><body><pre>");
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
closeViewFile(App *app)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
|
|
|
|
|
|
|
robot = app->data;
|
2005-01-22 02:50:42 +03:00
|
|
|
fprintf(app->view.out, "</pre></body></html>");
|
2005-01-21 00:09:40 +03:00
|
|
|
fclose(app->view.out);
|
|
|
|
if (!robot->viewFileAdded)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
unlink(robot->viewFile);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-21 00:09:40 +03:00
|
|
|
robot->viewFileAdded = 0;
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
processURL(App *app)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot *robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
struct timeval theTime;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
robot = app->data;
|
|
|
|
|
2000-02-01 21:24:20 +03:00
|
|
|
gettimeofday(&theTime, NULL);
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
app->status(app, "processURL", __FILE__, __LINE__);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
openViewFile(app);
|
2005-01-25 11:22:29 +03:00
|
|
|
httpFree(httpProcess(app, robot->url, NULL, NULL));
|
2005-01-21 00:09:40 +03:00
|
|
|
closeViewFile(app);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
app->status(app, "processURL done", __FILE__, __LINE__);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
app->time(app, appTimeTotal, &theTime);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static void *
|
|
|
|
startHere(void *a)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
Robot robot;
|
|
|
|
App *app;
|
|
|
|
|
|
|
|
robot.slot = (int) a;
|
|
|
|
|
|
|
|
app = appAlloc();
|
|
|
|
app->status = robotStatus;
|
|
|
|
app->time = robotTime;
|
2005-01-25 11:22:29 +03:00
|
|
|
app->httpResponse = robotHTTP;
|
|
|
|
app->httpResponseBody = robotHTTPBody;
|
|
|
|
app->httpResponseHeaderName = robotHTTPHeaderName;
|
|
|
|
app->httpResponseHeaderValue = robotHTTPHeaderValue;
|
2005-01-21 00:09:40 +03:00
|
|
|
app->html = robotHTML;
|
|
|
|
app->htmlAttributeName = robotHTMLAttributeName;
|
|
|
|
app->htmlAttributeValue = robotHTMLAttributeValue;
|
2005-02-03 04:06:24 +03:00
|
|
|
app->htmlDeclaration = robotHTMLDeclaration;
|
|
|
|
app->htmlProcessingInstruction = robotHTMLProcessingInstruction;
|
|
|
|
app->htmlTag = robotHTMLTag;
|
|
|
|
app->htmlText = robotHTMLText;
|
2005-01-21 00:09:40 +03:00
|
|
|
app->contentType = robotContentType;
|
2005-01-25 11:22:29 +03:00
|
|
|
app->httpResponseCharSet = robotHTTPCharSet;
|
2005-01-21 00:09:40 +03:00
|
|
|
app->data = &robot;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
while (1)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
threadMutexLock();
|
|
|
|
while ((!currURL) && (count < LIMIT))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
app->status(app, waiting, __FILE__, __LINE__);
|
2005-01-19 20:16:09 +03:00
|
|
|
threadCondWait();
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
if (count >= LIMIT)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
threadMutexUnlock();
|
2000-02-01 21:24:20 +03:00
|
|
|
break;
|
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
count++;
|
|
|
|
if (!(count % INTERVAL))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
printStats();
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-21 00:09:40 +03:00
|
|
|
robot.count = count;
|
|
|
|
robot.url = currURL;
|
2005-01-19 20:16:09 +03:00
|
|
|
currURL = currURL->next;
|
|
|
|
threadMutexUnlock();
|
2005-01-21 00:09:40 +03:00
|
|
|
processURL(app);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
|
|
|
|
return NULL;
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
static int
|
|
|
|
allThreadsAreWaiting(void)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
int i;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
for (i = 0; i < numberOfSlots; i++)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
if (statusEntries[i].message != waiting)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
return i == numberOfSlots;
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
initKnownBadTags(void)
|
|
|
|
{
|
|
|
|
unsigned char **p;
|
|
|
|
|
|
|
|
knownBadTags = hashAlloc(NULL);
|
|
|
|
p = badTags;
|
|
|
|
while (*p)
|
|
|
|
{
|
|
|
|
hashAdd(knownBadTags, *p, NULL);
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static FD *
|
|
|
|
addFD(int fd, Handler func)
|
|
|
|
{
|
|
|
|
FD *f;
|
|
|
|
|
|
|
|
if (fd > maxFD)
|
|
|
|
{
|
|
|
|
if (table)
|
|
|
|
{
|
|
|
|
table = utilRealloc(table,
|
|
|
|
(maxFD + 1) * sizeof(*table),
|
|
|
|
(fd + 1) * sizeof(*table));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
table = calloc(fd + 1, sizeof(*table));
|
|
|
|
}
|
|
|
|
if (!table)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
maxFD = fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
f = malloc(sizeof(FD));
|
|
|
|
if (!f)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
f->handler = func;
|
|
|
|
f->file = NULL;
|
|
|
|
/*
|
|
|
|
f->id = -1;
|
|
|
|
f->port = 0;
|
|
|
|
f->suspend = 0;
|
|
|
|
f->writeFD = -1;
|
|
|
|
*/
|
|
|
|
|
|
|
|
table[fd] = f;
|
|
|
|
|
|
|
|
FD_SET(fd, &fdSet);
|
|
|
|
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
removeFD(int fd)
|
|
|
|
{
|
|
|
|
FD *f;
|
|
|
|
|
|
|
|
f = table[fd];
|
|
|
|
if (f)
|
|
|
|
{
|
|
|
|
FD_CLR(fd, &fdSet);
|
|
|
|
if (f->file && (fileno(f->file) == fd))
|
|
|
|
{
|
|
|
|
fclose(f->file);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
close(fd);
|
|
|
|
}
|
|
|
|
free(f);
|
|
|
|
table[fd] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
compareStatusEntries(const void *e1, const void *e2)
|
|
|
|
{
|
|
|
|
StatusEntry *entry1;
|
|
|
|
StatusEntry *entry2;
|
|
|
|
|
|
|
|
entry1 = &statusEntries[*((int *) e1)];
|
|
|
|
entry2 = &statusEntries[*((int *) e2)];
|
|
|
|
|
|
|
|
return entry1->time - entry2->time;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
readClientRequest(int fd)
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
unsigned char b[10240];
|
2000-02-01 21:24:20 +03:00
|
|
|
int bytesRead;
|
|
|
|
FILE *file;
|
|
|
|
int i;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
bytesRead = recv(fd, b, sizeof(b) - 1, 0);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (bytesRead < 0)
|
|
|
|
{
|
|
|
|
if (errno != ECONNRESET)
|
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
perror("recv");
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
removeFD(fd);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
else if (!bytesRead)
|
|
|
|
{
|
|
|
|
removeFD(fd);
|
|
|
|
return;
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
b[bytesRead] = 0;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
file = fdopen(fd, "w");
|
|
|
|
if (!file)
|
|
|
|
{
|
|
|
|
char *err = "fdopen failed\n";
|
2005-01-19 20:16:09 +03:00
|
|
|
send(fd, err, strlen(err), 0);
|
2000-02-01 21:24:20 +03:00
|
|
|
removeFD(fd);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
table[fd]->file = file;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
if (strstr((char *) b, "/exit"))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
char *goodbye =
|
|
|
|
"HTTP/1.0 200 OK\n"
|
|
|
|
"Content-Type: text/html\n"
|
|
|
|
"\n"
|
|
|
|
"Bye!"
|
|
|
|
;
|
|
|
|
fprintf(file, goodbye);
|
|
|
|
removeFD(fd);
|
|
|
|
exit(0);
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
else if (strstr((char *) b, "/times"))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
char *begin =
|
|
|
|
"HTTP/1.0 200 OK\n"
|
|
|
|
"Content-Type: text/html\n"
|
|
|
|
"\n"
|
|
|
|
;
|
|
|
|
|
|
|
|
fprintf(file, begin);
|
|
|
|
printTimes(file);
|
|
|
|
removeFD(fd);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
char *hello =
|
|
|
|
"HTTP/1.0 200 OK\n"
|
|
|
|
"Content-Type: text/html\n"
|
|
|
|
"\n"
|
|
|
|
;
|
|
|
|
|
|
|
|
fprintf(file, hello);
|
|
|
|
|
|
|
|
fprintf(file, "<table>\n");
|
|
|
|
|
|
|
|
fprintf(file, "<tr bgcolor=#cccccc>");
|
|
|
|
fprintf(file, "<td>Time</td>");
|
|
|
|
fprintf(file, "<td>Message</td>");
|
|
|
|
fprintf(file, "<td>File</td>");
|
|
|
|
fprintf(file, "<td>Line</td>");
|
|
|
|
fprintf(file, "</tr>");
|
|
|
|
|
|
|
|
for (i = 0; i < numberOfSlots; i++)
|
|
|
|
{
|
|
|
|
sortedStatusEntries[i] = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
qsort(sortedStatusEntries, numberOfSlots, sizeof(int),
|
|
|
|
compareStatusEntries);
|
|
|
|
|
|
|
|
for (i = 0; i < numberOfSlots; i++)
|
|
|
|
{
|
|
|
|
StatusEntry *entry;
|
|
|
|
|
|
|
|
entry = &statusEntries[sortedStatusEntries[i]];
|
|
|
|
fprintf(file, "<tr>");
|
|
|
|
fprintf(file, "<td>%s</td>", ctime(&entry->time));
|
|
|
|
fprintf(file, "<td>%s</td>",
|
|
|
|
entry->message ? entry->message : "NULL");
|
|
|
|
fprintf(file, "<td>%s</td>",
|
|
|
|
entry->file ? entry->file : "NULL");
|
|
|
|
fprintf(file, "<td>%d</td>", entry->line);
|
|
|
|
fprintf(file, "</tr>");
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(file, "</table>\n");
|
|
|
|
|
|
|
|
removeFD(fd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
acceptNewClient(int fd)
|
|
|
|
{
|
|
|
|
FD *f;
|
|
|
|
int newFD;
|
|
|
|
|
|
|
|
newFD = netAccept(fd);
|
|
|
|
if (newFD < 0)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "netAccept failed\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
f = addFD(newFD, readClientRequest);
|
|
|
|
if (!f)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "addFD failed\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *
|
|
|
|
startStatusFunc(void *a)
|
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
App *app;
|
2000-02-01 21:24:20 +03:00
|
|
|
FD *f;
|
|
|
|
int fd;
|
|
|
|
fd_set localFDSet;
|
|
|
|
int ret;
|
|
|
|
|
2005-01-21 00:09:40 +03:00
|
|
|
app = appAlloc();
|
|
|
|
fd = netListen(app, NULL, &mainPort);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (fd < 0)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "netListen failed\n");
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
f = addFD(fd, acceptNewClient);
|
|
|
|
if (!f)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "addFD failed\n");
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
localFDSet = fdSet;
|
|
|
|
ret = select(maxFD + 1, &localFDSet, NULL, NULL, NULL);
|
|
|
|
if (ret == -1)
|
|
|
|
{
|
|
|
|
perror("select");
|
|
|
|
}
|
|
|
|
for (fd = 0; fd <= maxFD; fd++)
|
|
|
|
{
|
|
|
|
if (FD_ISSET(fd, &localFDSet))
|
|
|
|
{
|
|
|
|
(*table[fd]->handler)(fd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
startStatusThread(void)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
ret = threadCreate(&statusThread, startStatusFunc, NULL);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (ret)
|
|
|
|
{
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
time_t theTime;
|
|
|
|
URL *url;
|
2005-01-19 20:16:09 +03:00
|
|
|
int ret;
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
#if !defined(WINDOWS)
|
2000-02-01 21:24:20 +03:00
|
|
|
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "signal failed\n");
|
|
|
|
exit(0);
|
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
#endif
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
for (i = 1; i < argc; i++)
|
|
|
|
{
|
|
|
|
if (!strcmp(argv[i], "-d"))
|
|
|
|
{
|
|
|
|
i++;
|
|
|
|
limitDomains[limitDomainsIndex] = argv[i];
|
|
|
|
limitDomainsIndex++;
|
|
|
|
}
|
|
|
|
else if (!strcmp(argv[i], "-o"))
|
|
|
|
{
|
|
|
|
i++;
|
|
|
|
outFile = argv[i];
|
|
|
|
}
|
|
|
|
else if (!strcmp(argv[i], "-s"))
|
|
|
|
{
|
|
|
|
i++;
|
|
|
|
firstURL = argv[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
if (!netInit())
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
if (!threadInit())
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
time(&theTime);
|
|
|
|
startTime = copyString((unsigned char *) ctime(&theTime));
|
|
|
|
|
|
|
|
#ifdef ROBOT_LOG_ATTRIBUTES
|
|
|
|
attributeTable = hashAlloc(freeEntry);
|
|
|
|
#endif
|
|
|
|
contentTypeTable = hashAlloc(freeEntry);
|
|
|
|
httpCharsetTable = hashAlloc(freeEntry);
|
|
|
|
httpHeaderTable = hashAlloc(freeEntry);
|
|
|
|
metaCharsetTable = hashAlloc(freeEntry);
|
|
|
|
schemeTable = hashAlloc(freeEntry);
|
|
|
|
#ifdef ROBOT_LOG_TAGS
|
|
|
|
tagTable = hashAlloc(freeEntry);
|
|
|
|
#endif
|
|
|
|
urlTable = hashAlloc(NULL);
|
|
|
|
|
|
|
|
initKnownBadTags();
|
|
|
|
|
|
|
|
addURLInit(addURLFunc, NULL, limitDomains);
|
|
|
|
|
|
|
|
htmlRegister("meta", "content", metaHandler);
|
|
|
|
htmlRegisterTagHandler(tagHandler);
|
|
|
|
|
|
|
|
startStatusThread();
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
for (i = 0; i < numberOfSlots; i++)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
ret = threadCreate(&slots[i], startHere, (void *) i);
|
|
|
|
if (ret)
|
|
|
|
{
|
|
|
|
exit(0);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
threadYield();
|
|
|
|
|
|
|
|
if (!firstURL)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
firstURL = "http://mozilla.org/";
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
|
2000-02-01 21:24:20 +03:00
|
|
|
url = urlParse((unsigned char *) firstURL);
|
|
|
|
hashAdd(urlTable, (unsigned char *) firstURL, 0);
|
|
|
|
urls = url;
|
|
|
|
lastURL = url;
|
2005-01-19 20:16:09 +03:00
|
|
|
currURL = url;
|
|
|
|
threadCondSignal();
|
|
|
|
threadYield();
|
2000-02-01 21:24:20 +03:00
|
|
|
while (1)
|
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
if (allThreadsAreWaiting())
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
for (i = 0; i < numberOfSlots; i++)
|
|
|
|
{
|
|
|
|
threadCancel(slots[i]);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
break;
|
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
if (count >= LIMIT)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
threadCondBroadcast();
|
|
|
|
for (i = 0; i < numberOfSlots; i++)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-19 20:16:09 +03:00
|
|
|
threadJoin(slots[i]);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-19 20:16:09 +03:00
|
|
|
threadCancel(statusThread);
|
|
|
|
break;
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
printStats();
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
#ifdef ROBOT_LOG_ATTRIBUTES
|
|
|
|
hashFree(attributeTable);
|
|
|
|
#endif
|
|
|
|
hashFree(contentTypeTable);
|
|
|
|
hashFree(httpCharsetTable);
|
|
|
|
hashFree(httpHeaderTable);
|
|
|
|
hashFree(metaCharsetTable);
|
|
|
|
hashFree(schemeTable);
|
|
|
|
#ifdef ROBOT_LOG_TAGS
|
|
|
|
hashFree(tagTable);
|
|
|
|
#endif
|
|
|
|
hashFree(urlTable);
|
|
|
|
|
|
|
|
url = urls;
|
|
|
|
while (url)
|
|
|
|
{
|
|
|
|
URL *tmp;
|
|
|
|
|
|
|
|
tmp = url;
|
|
|
|
url = url->next;
|
|
|
|
urlFree(tmp);
|
|
|
|
}
|
|
|
|
|
|
|
|
exit(0);
|
|
|
|
return 1;
|
|
|
|
}
|