From 6a5734b52e57be1f2265e4ab08118518b10cd874 Mon Sep 17 00:00:00 2001 From: "erik%vanderpoel.org" Date: Thu, 3 Feb 2005 01:06:24 +0000 Subject: [PATCH] added support for XHTML's empty elements like <br />; added support for declarations like <!DOCTYPE ...>; added support for processing instructions like <?xml ... ?> --- webtools/web-sniffer/app.c | 12 ++ webtools/web-sniffer/app.h | 2 + webtools/web-sniffer/cgiview.c | 14 ++ webtools/web-sniffer/html.c | 259 ++++++++++++++++++--------------- webtools/web-sniffer/html.h | 4 +- webtools/web-sniffer/proxy.c | 16 ++ webtools/web-sniffer/robot.c | 34 ++--- webtools/web-sniffer/view.c | 16 ++ webtools/web-sniffer/view.h | 2 + 9 files changed, 222 insertions(+), 137 deletions(-) diff --git a/webtools/web-sniffer/app.c b/webtools/web-sniffer/app.c index 5a3c6f3471e1..a58d04f56439 100644 --- a/webtools/web-sniffer/app.c +++ b/webtools/web-sniffer/app.c @@ -44,6 +44,16 @@ appHTMLAttributeValue(App *app, HTML *html, Buf *buf) { } +static void +appHTMLDeclaration(App *app, Buf *buf) +{ +} + +static void +appHTMLProcessingInstruction(App *app, Buf *buf) +{ +} + static void appHTMLTag(App *app, HTML *html, Buf *buf) { @@ -110,6 +120,8 @@ App appDefault = appHTML, appHTMLAttributeName, appHTMLAttributeValue, + appHTMLDeclaration, + appHTMLProcessingInstruction, appHTMLTag, appHTMLText, appHTTPRequest, diff --git a/webtools/web-sniffer/app.h b/webtools/web-sniffer/app.h index 84e68c752d14..9a41e2daa3f9 100644 --- a/webtools/web-sniffer/app.h +++ b/webtools/web-sniffer/app.h @@ -43,6 +43,8 @@ struct App void (*html)(App *app, Buf *buf); void (*htmlAttributeName)(App *app, HTML *html, Buf *buf); void (*htmlAttributeValue)(App *app, HTML *html, Buf *buf); + void (*htmlDeclaration)(App *app, Buf *buf); + void (*htmlProcessingInstruction)(App *app, Buf *buf); void (*htmlTag)(App *app, HTML *html, Buf *buf); void (*htmlText)(App *app, Buf *buf); diff --git a/webtools/web-sniffer/cgiview.c b/webtools/web-sniffer/cgiview.c index 7e5a27470791..9158679ec0ee 100644 --- 
a/webtools/web-sniffer/cgiview.c +++ b/webtools/web-sniffer/cgiview.c @@ -84,6 +84,18 @@ cgiviewHTMLAttributeValue(App *app, HTML *html, Buf *buf) } } +static void +cgiviewHTMLDeclaration(App *app, Buf *buf) +{ + viewHTMLDeclaration(app, buf); +} + +static void +cgiviewHTMLProcessingInstruction(App *app, Buf *buf) +{ + viewHTMLProcessingInstruction(app, buf); +} + static void cgiviewHTMLTag(App *app, HTML *html, Buf *buf) { @@ -309,6 +321,8 @@ main(int argc, char *argv[]) app->html = cgiviewHTML; app->htmlAttributeName = cgiviewHTMLAttributeName; app->htmlAttributeValue = cgiviewHTMLAttributeValue; + app->htmlDeclaration = cgiviewHTMLDeclaration; + app->htmlProcessingInstruction = cgiviewHTMLProcessingInstruction; app->htmlTag = cgiviewHTMLTag; app->htmlText = cgiviewHTMLText; app->httpRequest = cgiviewHTTPRequest; diff --git a/webtools/web-sniffer/html.c b/webtools/web-sniffer/html.c index 577e38dc3e4b..dc47e0eb84f6 100644 --- a/webtools/web-sniffer/html.c +++ b/webtools/web-sniffer/html.c @@ -36,8 +36,6 @@ typedef struct HTMLState { unsigned short mask; - unsigned short saved; - unsigned short unGotten; HTML *html; } HTMLState; @@ -336,14 +334,6 @@ htmlGetByte(Buf *buf, HTMLState *state) { unsigned short c; unsigned short ret; - unsigned short tmp; - - if (state->unGotten != 256) - { - tmp = state->unGotten; - state->unGotten = 256; - return tmp; - } c = bufGetByte(buf); if (c == 256) @@ -411,17 +401,9 @@ htmlGetByte(Buf *buf, HTMLState *state) ret = c | state->mask; } - state->saved = ret; - return ret; } -static void -htmlUnGetByte(HTMLState *state) -{ - state->unGotten = state->saved; -} - static unsigned short eatWhiteSpace(Buf *buf, HTMLState *state, unsigned short c) { @@ -469,6 +451,7 @@ readAttribute(App *app, Buf *buf, HTMLState *state, unsigned short c) ( (c != 256) && (c != '>') && + (c != '/') && (c != '=') && (c != ' ') && (c != '\t') && @@ -500,6 +483,10 @@ readAttribute(App *app, Buf *buf, HTMLState *state, unsigned short c) 
state->html->currentAttribute = attr; attr->name = bufCopyLower(buf); app->htmlAttributeName(app, state->html, buf); + if (c == '/') + { + c = htmlGetByte(buf, state); + } if ((c == 256) || (c == '>')) { return c; @@ -508,6 +495,10 @@ readAttribute(App *app, Buf *buf, HTMLState *state, unsigned short c) { c = eatWhiteSpace(buf, state, c); } + if (c == '/') + { + c = htmlGetByte(buf, state); + } if ((c == 256) || (c == '>')) { return c; @@ -542,6 +533,7 @@ readAttribute(App *app, Buf *buf, HTMLState *state, unsigned short c) ( (c != 256) && (c != '>') && + (c != '/') && (c != ' ') && (c != '\t') && (c != '\r') && @@ -560,12 +552,21 @@ readAttribute(App *app, Buf *buf, HTMLState *state, unsigned short c) app->htmlAttributeValue(app, state->html, buf); } callHandler(app, state->html); - if (c == '>') + if (c == '/') + { + c = htmlGetByte(buf, state); + } + if ((c == 256) || (c == '>')) { return c; } } - return eatWhiteSpace(buf, state, c); + c = eatWhiteSpace(buf, state, c); + if (c == '/') + { + c = htmlGetByte(buf, state); + } + return c; } static int @@ -604,96 +605,11 @@ endTag(App *app, Buf *buf, HTMLState *state) } static unsigned short -readTag(App *app, Buf *buf, HTMLState *state) +readTag(App *app, Buf *buf, HTMLState *state, unsigned short c) { - unsigned short c; - - bufMark(buf, -1); + bufMark(buf, c == '/' ? 
0 : -1); app->html(app, buf); - c = htmlGetByte(buf, state); - if (c == '!') - { - c = htmlGetByte(buf, state); - if (c == '-') - { - c = htmlGetByte(buf, state); - if (c == '-') - { - unsigned long beginningOfComment = - bufCurrent(buf); - while (1) - { - c = htmlGetByte(buf, state); - if (c == '-') - { - c = htmlGetByte(buf, state); - if (c == '-') - { - c = htmlGetByte(buf, - state); - if (c == '>') - { - return endTag(app, - buf, state); - } - else if (c == '-') - { - do - { - c = htmlGetByte(buf, state); - } while (c == '-'); - if (c == '>') - { - return endTag(app, - buf, state); - } - } - } - } - if (c == 256) - { - bufSet(buf, beginningOfComment); - while (1) - { - c = htmlGetByte(buf, state); - if (c == '>') - { - return endTag(app, buf, state); - } - else if (c == 256) - { - fprintf(stderr, - "bad comment\n"); - bufMark(buf, -1); - FREE(state->html->tag); - state->html->tag = - copyString((unsigned - char *) "!--"); - state->html->tagIsKnown = 1; - app->htmlTag(app, - state->html, buf); - return c; - } - } - } - } - } - else - { - htmlUnGetByte(state); - } - } - else - { - htmlUnGetByte(state); - } - } - else - { - htmlUnGetByte(state); - } - do { c = htmlGetByte(buf, state); @@ -702,6 +618,7 @@ readTag(App *app, Buf *buf, HTMLState *state) ( (c != 256) && (c != '>') && + (c != '/') && (c != ' ') && (c != '\t') && (c != '\r') && @@ -710,8 +627,7 @@ readTag(App *app, Buf *buf, HTMLState *state) bufMark(buf, -1); FREE(state->html->tag); state->html->tag = bufCopyLower(buf); - if (hashLookup(knownTagTable, (*state->html->tag == '/') ? 
- state->html->tag + 1 : state->html->tag)) + if (hashLookup(knownTagTable, state->html->tag)) { state->html->tagIsKnown = 1; } @@ -720,20 +636,28 @@ readTag(App *app, Buf *buf, HTMLState *state) state->html->tagIsKnown = 0; } app->htmlTag(app, state->html, buf); + if (c == '/') + { + c = htmlGetByte(buf, state); + } if (c == 256) { return c; } - else if (c == '>') + if (c == '>') { return endTag(app, buf, state); } c = eatWhiteSpace(buf, state, c); + if (c == '/') + { + c = htmlGetByte(buf, state); + } if (c == 256) { return c; } - else if (c == '>') + if (c == '>') { return endTag(app, buf, state); } @@ -746,6 +670,10 @@ readTag(App *app, Buf *buf, HTMLState *state) { (*tagHandler)(app, state->html); } + if (c == 256) + { + return c; + } if (c == '>') { return endTag(app, buf, state); @@ -754,6 +682,75 @@ readTag(App *app, Buf *buf, HTMLState *state) return c; } +static unsigned short +readComment(App *app, Buf *buf, HTMLState *state) +{ + unsigned short c; + unsigned long begin; + + c = htmlGetByte(buf, state); + if (c != '-') + { + do + { + c = htmlGetByte(buf, state); + } while ((c != 256) && (c != '>')); + return c; + } + begin = bufCurrent(buf); + while (1) + { + c = htmlGetByte(buf, state); + if (c == '-') + { + c = htmlGetByte(buf, state); + if (c == '-') + { + c = htmlGetByte(buf, + state); + if (c == '>') + { + return endTag(app, buf, state); + } + if (c == '-') + { + do + { + c = htmlGetByte(buf, state); + } while (c == '-'); + if (c == '>') + { + return endTag(app, buf, state); + } + } + } + } + if (c == 256) + { + bufSet(buf, begin); + while (1) + { + c = htmlGetByte(buf, state); + if (c == '>') + { + return endTag(app, buf, state); + } + if (c == 256) + { + fprintf(stderr, "bad comment\n"); + bufMark(buf, -1); + FREE(state->html->tag); + state->html->tag = copyString( + (unsigned char *) "!--"); + state->html->tagIsKnown = 1; + app->htmlTag(app, state->html, buf); + return c; + } + } + } + } +} + static unsigned short readText(App *app, Buf *buf, 
HTMLState *state) { @@ -817,8 +814,6 @@ htmlRead(App *app, Buf *buf, unsigned char *base) html.currentAttribute = NULL; state.mask = 0; - state.saved = 0; - state.unGotten = 256; state.html = &html; c = htmlGetByte(buf, &state); @@ -827,18 +822,48 @@ htmlRead(App *app, Buf *buf, unsigned char *base) if (c == '<') { c = htmlGetByte(buf, &state); - htmlUnGetByte(&state); if ( (('a' <= c) && (c <= 'z')) || (('A' <= c) && (c <= 'Z')) || - (c == '/') || - (c == '!') + (c == '/') ) { - c = readTag(app, buf, &state); + c = readTag(app, buf, &state, c); c = dealWithScript(buf, &state, c); } + else if (c == '!') + { + c = htmlGetByte(buf, &state); + if (c == '-') + { + c = readComment(app, buf, &state); + continue; + } + while ((c != 256) && (c != '>')) + { + c = htmlGetByte(buf, &state); + } + if (c == '>') + { + c = htmlGetByte(buf, &state); + } + bufMark(buf, -1); + app->htmlDeclaration(app, buf); + } + else if (c == '?') + { + do + { + c = htmlGetByte(buf, &state); + } while ((c != 256) && (c != '>')); + if (c == '>') + { + c = htmlGetByte(buf, &state); + } + bufMark(buf, -1); + app->htmlProcessingInstruction(app, buf); + } else { diag(__LINE__, &state, c); diff --git a/webtools/web-sniffer/html.h b/webtools/web-sniffer/html.h index e77ba864b6ce..d37a1a00cd09 100644 --- a/webtools/web-sniffer/html.h +++ b/webtools/web-sniffer/html.h @@ -39,10 +39,10 @@ typedef struct HTML unsigned char *base; unsigned char *url; unsigned char *tag; - int tagIsKnown; HTMLAttribute *attributes; HTMLAttribute *currentAttribute; - int currentAttributeIsURL; + unsigned int tagIsKnown : 1; + unsigned int currentAttributeIsURL : 1; } HTML; typedef void (*HTMLHandler)(App *app, HTML *html); diff --git a/webtools/web-sniffer/proxy.c b/webtools/web-sniffer/proxy.c index ea119c1bf19c..aa44be589f6f 100644 --- a/webtools/web-sniffer/proxy.c +++ b/webtools/web-sniffer/proxy.c @@ -49,6 +49,8 @@ typedef struct Arg static void proxyHTML(App *app, Buf *buf); static void proxyHTMLAttributeName(App *app, 
HTML *html, Buf *buf); static void proxyHTMLAttributeValue(App *app, HTML *html, Buf *buf); +static void proxyHTMLDeclaration(App *app, Buf *buf); +static void proxyHTMLProcessingInstruction(App *app, Buf *buf); static void proxyHTMLTag(App *app, HTML *html, Buf *buf); static void proxyHTMLText(App *app, Buf *buf); static void proxyHTTP(App *app, Buf *buf); @@ -191,6 +193,8 @@ proxyApp(FD *f) app->html = proxyHTML; app->htmlAttributeName = proxyHTMLAttributeName; app->htmlAttributeValue = proxyHTMLAttributeValue; + app->htmlDeclaration = proxyHTMLDeclaration; + app->htmlProcessingInstruction = proxyHTMLProcessingInstruction; app->htmlTag = proxyHTMLTag; app->htmlText = proxyHTMLText; app->httpResponse = proxyHTTP; @@ -554,6 +558,18 @@ proxyHTMLAttributeValue(App *app, HTML *html, Buf *buf) viewHTMLAttributeValue(app, buf); } +static void +proxyHTMLDeclaration(App *app, Buf *buf) +{ + viewHTMLDeclaration(app, buf); +} + +static void +proxyHTMLProcessingInstruction(App *app, Buf *buf) +{ + viewHTMLProcessingInstruction(app, buf); +} + static void proxyHTMLTag(App *app, HTML *html, Buf *buf) { diff --git a/webtools/web-sniffer/robot.c b/webtools/web-sniffer/robot.c index d9aec9724c17..9b50933713ec 100644 --- a/webtools/web-sniffer/robot.c +++ b/webtools/web-sniffer/robot.c @@ -265,10 +265,6 @@ addURLFunc(App *app, URL *url) static void robotHTTP(App *app, Buf *buf) { - Robot *robot; - - robot = app->data; - viewHTTP(app, buf); } @@ -295,20 +291,12 @@ robotHTTPHeaderName(App *app, Buf *buf) static void robotHTTPHeaderValue(App *app, Buf *buf, unsigned char *url) { - Robot *robot; - - robot = app->data; - viewHTTPHeaderValue(app, buf); } static void robotHTML(App *app, Buf *buf) { - Robot *robot; - - robot = app->data; - viewHTML(app, buf); } @@ -397,13 +385,21 @@ robotHTMLAttributeName(App *app, HTML *html, Buf *buf) static void robotHTMLAttributeValue(App *app, HTML *html, Buf *buf) { - Robot *robot; - - robot = app->data; - viewHTMLAttributeValue(app, buf); } +static 
void +robotHTMLDeclaration(App *app, Buf *buf) +{ + viewHTMLDeclaration(app, buf); +} + +static void +robotHTMLProcessingInstruction(App *app, Buf *buf) +{ + viewHTMLProcessingInstruction(app, buf); +} + static void robotContentType(App *app, unsigned char *contentType) { @@ -693,10 +689,12 @@ startHere(void *a) app->httpResponseHeaderName = robotHTTPHeaderName; app->httpResponseHeaderValue = robotHTTPHeaderValue; app->html = robotHTML; - app->htmlText = robotHTMLText; - app->htmlTag = robotHTMLTag; app->htmlAttributeName = robotHTMLAttributeName; app->htmlAttributeValue = robotHTMLAttributeValue; + app->htmlDeclaration = robotHTMLDeclaration; + app->htmlProcessingInstruction = robotHTMLProcessingInstruction; + app->htmlTag = robotHTMLTag; + app->htmlText = robotHTMLText; app->contentType = robotContentType; app->httpResponseCharSet = robotHTTPCharSet; app->data = &robot; diff --git a/webtools/web-sniffer/view.c b/webtools/web-sniffer/view.c index c92389a94c83..a924fb954d2c 100644 --- a/webtools/web-sniffer/view.c +++ b/webtools/web-sniffer/view.c @@ -177,6 +177,22 @@ viewHTMLAttributeValue(App *app, Buf *buf) fprintf(app->view.out, ""); } +void +viewHTMLDeclaration(App *app, Buf *buf) +{ + fprintf(app->view.out, ""); + print(&app->view, buf); + fprintf(app->view.out, ""); +} + +void +viewHTMLProcessingInstruction(App *app, Buf *buf) +{ + fprintf(app->view.out, ""); + print(&app->view, buf); + fprintf(app->view.out, ""); +} + void viewHTMLTag(App *app, Buf *buf) { diff --git a/webtools/web-sniffer/view.h b/webtools/web-sniffer/view.h index 4916f9d11d45..a607fb2c5c1b 100644 --- a/webtools/web-sniffer/view.h +++ b/webtools/web-sniffer/view.h @@ -36,6 +36,8 @@ typedef struct View void viewHTML(App *app, Buf *buf); void viewHTMLAttributeName(App *app, Buf *buf); void viewHTMLAttributeValue(App *app, Buf *buf); +void viewHTMLDeclaration(App *app, Buf *buf); +void viewHTMLProcessingInstruction(App *app, Buf *buf); void viewHTMLTag(App *app, Buf *buf); void 
viewHTMLText(App *app, Buf *buf); void viewHTTP(App *app, Buf *buf);