/* * The contents of this file are subject to the Mozilla Public * License Version 1.1 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or * implied. See the License for the specific language governing * rights and limitations under the License. * * The Original Code is Web Sniffer. * * The Initial Developer of the Original Code is Erik van der Poel. * Portions created by Erik van der Poel are * Copyright (C) 1998,1999,2000 Erik van der Poel. * All Rights Reserved. * * Contributor(s): Bruce Robson */ #include #include #include #include #include #include "addurl.h" #include "html.h" #include "http.h" #include "io.h" #include "main.h" #include "mime.h" #include "net.h" #include "url.h" #include "utils.h" static unsigned char *emptyHTTPResponse = (unsigned char *) ""; static unsigned char *http09Response = (unsigned char *) ""; static unsigned char *locationURLWasAdded = (unsigned char *) ""; static int nonEmptyHTTPResponseCount = 0; static int http10OrGreaterCount = 0; static unsigned short readLine(Input *input, unsigned short c) { while ((c != 256) && (c != '\r') && (c != '\n')) { c = getByte(input); } if (c == '\r') { c = getByte(input); if (c == '\n') { c = getByte(input); } } else if (c == '\n') { c = getByte(input); } return c; } static unsigned short readSpaceTab(Input *input, unsigned short c) { while ((c == ' ') || (c == '\t')) { c = getByte(input); } return c; } static unsigned short readNonWhiteSpace(Input *input, unsigned short c) { while ( (c != 256) && (c != ' ') && (c != '\t') && (c != '\r') && (c != '\n') ) { c = getByte(input); } return c; } static unsigned char * httpReadHeaders(HTTP *http, void *a, Input *input, unsigned char *url) { unsigned short c; unsigned char *charset; unsigned char *contentType; int locationFound; unsigned char *name; URL *rel; ContentType *type; unsigned char *value; contentType = NULL; locationFound = 0; if (!*current(input)) { return emptyHTTPResponse; } nonEmptyHTTPResponseCount++; if (strncmp((char *) current(input), "HTTP/", 5)) { /* XXX deal with HTTP/0.9? */ return http09Response; } http10OrGreaterCount++; mark(input, 0); c = readNonWhiteSpace(input, getByte(input)); c = readSpaceTab(input, c); sscanf((char *) current(input) - 1, "%d", &http->status); c = readLine(input, c); while (1) { if (c == 256) { mark(input, 0); reportHTTP(a, input); break; } mark(input, -1); reportHTTP(a, input); if ((c == '\r') || (c == '\n')) { readLine(input, c); unGetByte(input); mark(input, 0); reportHTTP(a, input); break; } while ( (c != 256) && (c != '\r') && (c != '\n') && (c != ':') ) { c = getByte(input); } if (c != ':') { mark(input, -1); fprintf(stderr, "no colon in HTTP header \"%s\": %s\n", copy(input), url); return NULL; } mark(input, -1); reportHTTPHeaderName(a, input); name = copyLower(input); c = readSpaceTab(input, getByte(input)); mark(input, -1); reportHTTP(a, input); c = readLine(input, c); if ((c == ' ') || (c == '\t')) { do { c = readLine(input, c); } while ((c == ' ') || (c == '\t')); } c = trimTrailingWhiteSpace(input); mark(input, -1); value = copy(input); if (!strcasecmp((char *) name, "content-type")) { reportHTTPHeaderValue(a, input, NULL); type = mimeParseContentType(value); contentType = mimeGetContentType(type); charset = mimeGetContentTypeParameter(type, "charset"); if (charset) { reportHTTPCharSet(a, charset); } mimeFreeContentType(type); } else if (!strcasecmp((char *) name, "location")) { reportHTTPHeaderValue(a, input, value); /* XXX supposed to be absolute URL */ rel = urlRelative(url, value); addURL(a, rel->url); urlFree(rel); locationFound = 1; } else { reportHTTPHeaderValue(a, input, NULL); } free(name); free(value); c = readLine(input, c); mark(input, -1); reportHTTP(a, input); } if (!contentType) { if (locationFound) { return locationURLWasAdded; } } return contentType; } void httpParseRequest(HTTP *http, void *a, unsigned char *url) { unsigned short c; mark(http->input, 0); do { c = getByte(http->input); } while (c != 256); mark(http->input, -1); reportHTTP(a, http->input); } void httpParseStream(HTTP *http, void *a, unsigned char *url) { const unsigned char *begin; unsigned short c; unsigned char *contentType; begin = current(http->input); contentType = httpReadHeaders(http, a, http->input, url); http->body = current(http->input); http->bodyLen = inputLength(http->input) - (http->body - begin); if (contentType) { if ( (contentType != emptyHTTPResponse) && (contentType != http09Response) && (contentType != locationURLWasAdded) ) { reportContentType(a, contentType); if (!strcasecmp((char *) contentType, "text/html")) { htmlRead(a, http->input, url); } else { do { c = getByte(http->input); } while (c != 256); mark(http->input, -1); reportHTTPBody(a, http->input); } free(contentType); } } else { fprintf(stderr, "no Content-Type: %s\n", url); } } void httpRead(HTTP *http, void *a, int sock, unsigned char *url) { struct timeval theTime; reportStatus(a, "readStream", __FILE__, __LINE__); gettimeofday(&theTime, NULL); http->input = readStream(sock, url); reportTime(REPORT_TIME_READSTREAM, &theTime); reportStatus(a, "readStream done", __FILE__, __LINE__); httpParseStream(http, a, url); } static void httpGetObject(HTTP *http, void *a, int sock, URL *url, unsigned char **headers) { char *get; unsigned char **h; char *httpStr; get = "GET "; httpStr = " HTTP/1.0\n"; write(sock, get, strlen(get)); if (url->path) { write(sock, url->path, strlen((char *) url->path)); } if (url->params) { write(sock, url->params, strlen((char *) url->params)); } if (url->query) { write(sock, url->query, strlen((char *) url->query)); } write(sock, httpStr, strlen(httpStr)); h = headers; if (h) { while (*h) { write(sock, *h, strlen((char *) *h)); write(sock, "\n", 1); h++; } } write(sock, "\n", 1); httpRead(http, a, sock, url->url); } HTTP * httpAlloc(void) { HTTP *http; http = calloc(sizeof(HTTP), 1); if (!http) { fprintf(stderr, "cannot calloc HTTP\n"); exit(0); } return http; } void httpFree(HTTP *http) { if (http) { inputFree(http->input); free(http); } } HTTP * httpProcess(void *a, URL *url, unsigned char **headers) { HTTP *http; int port; int sock; port = -1; if (url->port == -1) { port = 80; } else { port = url->port; } if (!url->host) { fprintf(stderr, "url->host is NULL for %s\n", url->url ? (char *) url->url : ""); return NULL; } sock = netConnect(a, url->host, port); if (sock == -1) { return NULL; } http = httpAlloc(); httpGetObject(http, a, sock, url, headers); close(sock); return http; } int httpGetHTTP10OrGreaterCount(void) { return http10OrGreaterCount; } int httpGetNonEmptyHTTPResponseCount(void) { return nonEmptyHTTPResponseCount; }