2009-06-29 02:44:22 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2007 Henri Sivonen
|
2010-04-09 19:48:29 +04:00
|
|
|
* Copyright (c) 2008-2010 Mozilla Foundation
|
2009-06-29 02:44:22 +04:00
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
|
|
* DEALINGS IN THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package nu.validator.htmlparser.impl;
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
2010-09-28 11:32:31 +04:00
|
|
|
import nu.validator.htmlparser.annotation.Auto;
|
2010-12-08 15:37:19 +03:00
|
|
|
import nu.validator.htmlparser.annotation.Inline;
|
2009-06-29 02:44:22 +04:00
|
|
|
import nu.validator.htmlparser.common.ByteReadable;
|
|
|
|
|
|
|
|
import org.xml.sax.SAXException;
|
|
|
|
|
|
|
|
public abstract class MetaScanner {
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* Constant for "charset".
|
|
|
|
*/
|
2013-03-26 11:15:23 +04:00
|
|
|
private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' };
|
2009-06-29 02:44:22 +04:00
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* Constant for "content".
|
|
|
|
*/
|
2013-03-26 11:15:23 +04:00
|
|
|
private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' };
|
2010-12-08 15:37:19 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Constant for "http-equiv".
|
|
|
|
*/
|
2013-03-26 11:15:23 +04:00
|
|
|
private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q',
|
|
|
|
'u', 'i', 'v' };
|
2010-12-08 15:37:19 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Constant for "content-type".
|
|
|
|
*/
|
2013-03-26 11:15:23 +04:00
|
|
|
private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n',
|
|
|
|
't', '-', 't', 'y', 'p', 'e' };
|
2009-06-29 02:44:22 +04:00
|
|
|
|
|
|
|
private static final int NO = 0;
|
|
|
|
|
|
|
|
private static final int M = 1;
|
|
|
|
|
|
|
|
private static final int E = 2;
|
|
|
|
|
|
|
|
private static final int T = 3;
|
|
|
|
|
|
|
|
private static final int A = 4;
|
|
|
|
|
|
|
|
private static final int DATA = 0;
|
|
|
|
|
|
|
|
private static final int TAG_OPEN = 1;
|
|
|
|
|
|
|
|
private static final int SCAN_UNTIL_GT = 2;
|
|
|
|
|
|
|
|
private static final int TAG_NAME = 3;
|
|
|
|
|
|
|
|
private static final int BEFORE_ATTRIBUTE_NAME = 4;
|
|
|
|
|
|
|
|
private static final int ATTRIBUTE_NAME = 5;
|
|
|
|
|
|
|
|
private static final int AFTER_ATTRIBUTE_NAME = 6;
|
|
|
|
|
|
|
|
private static final int BEFORE_ATTRIBUTE_VALUE = 7;
|
|
|
|
|
|
|
|
private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8;
|
|
|
|
|
|
|
|
private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9;
|
|
|
|
|
|
|
|
private static final int ATTRIBUTE_VALUE_UNQUOTED = 10;
|
|
|
|
|
|
|
|
private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11;
|
|
|
|
|
|
|
|
private static final int MARKUP_DECLARATION_OPEN = 13;
|
|
|
|
|
|
|
|
private static final int MARKUP_DECLARATION_HYPHEN = 14;
|
|
|
|
|
|
|
|
private static final int COMMENT_START = 15;
|
|
|
|
|
|
|
|
private static final int COMMENT_START_DASH = 16;
|
|
|
|
|
|
|
|
private static final int COMMENT = 17;
|
|
|
|
|
|
|
|
private static final int COMMENT_END_DASH = 18;
|
|
|
|
|
|
|
|
private static final int COMMENT_END = 19;
|
|
|
|
|
|
|
|
private static final int SELF_CLOSING_START_TAG = 20;
|
|
|
|
|
2010-12-08 15:37:19 +03:00
|
|
|
private static final int HTTP_EQUIV_NOT_SEEN = 0;
|
|
|
|
|
|
|
|
private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
|
|
|
|
|
|
|
|
private static final int HTTP_EQUIV_OTHER = 2;
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The data source.
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
protected ByteReadable readable;
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The state of the state machine that recognizes the tag name "meta".
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
private int metaState = NO;
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The current position in recognizing the attribute name "content".
|
|
|
|
*/
|
2010-12-08 15:37:19 +03:00
|
|
|
private int contentIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The current position in recognizing the attribute name "charset".
|
|
|
|
*/
|
2010-12-08 15:37:19 +03:00
|
|
|
private int charsetIndex = Integer.MAX_VALUE;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The current position in recognizing the attribute name "http-equive".
|
|
|
|
*/
|
|
|
|
private int httpEquivIndex = Integer.MAX_VALUE;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The current position in recognizing the attribute value "content-type".
|
|
|
|
*/
|
|
|
|
private int contentTypeIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The tokenizer state.
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
protected int stateSave = DATA;
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The currently filled length of strBuf.
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
private int strBufLen;
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* Accumulation buffer for attribute values.
|
|
|
|
*/
|
2010-09-28 11:32:31 +04:00
|
|
|
private @Auto char[] strBuf;
|
2009-06-29 02:44:22 +04:00
|
|
|
|
2010-12-08 15:37:19 +03:00
|
|
|
private String content;
|
|
|
|
|
|
|
|
private String charset;
|
|
|
|
|
|
|
|
private int httpEquivState;
|
2009-06-29 02:44:22 +04:00
|
|
|
|
|
|
|
/**
 * Creates a scanner that has no data source yet and whose recognizers are
 * all in their initial state.
 */
public MetaScanner() {
    // No input attached yet.
    this.readable = null;

    // Tokenizer and tag-name recognizer start from scratch.
    this.stateSave = DATA;
    this.metaState = NO;

    // No attribute name or value is being matched.
    this.contentIndex = Integer.MAX_VALUE;
    this.charsetIndex = Integer.MAX_VALUE;
    this.httpEquivIndex = Integer.MAX_VALUE;
    this.contentTypeIndex = Integer.MAX_VALUE;

    // Empty accumulation buffer with an initial capacity of 36 chars.
    this.strBuf = new char[36];
    this.strBufLen = 0;

    // Nothing captured for the current tag.
    this.content = null;
    this.charset = null;
    this.httpEquivState = HTTP_EQUIV_NOT_SEEN;
}
|
|
|
|
|
|
|
|
@SuppressWarnings("unused") private void destructor() {
|
|
|
|
Portability.releaseString(content);
|
|
|
|
Portability.releaseString(charset);
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
|
|
|
|
// [NOCPP[
|
2009-06-29 02:44:22 +04:00
|
|
|
|
|
|
|
/**
|
2010-04-09 19:48:29 +04:00
|
|
|
* Reads a byte from the data source.
|
|
|
|
*
|
2009-06-29 02:44:22 +04:00
|
|
|
* -1 means end.
|
|
|
|
* @return
|
|
|
|
* @throws IOException
|
|
|
|
*/
|
|
|
|
protected int read() throws IOException {
|
|
|
|
return readable.readByte();
|
|
|
|
}
|
|
|
|
|
|
|
|
// ]NOCPP]
|
|
|
|
|
|
|
|
// WARNING When editing this, makes sure the bytecode length shown by javap
|
|
|
|
// stays under 8000 bytes!
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* The runs the meta scanning algorithm.
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
protected final void stateLoop(int state)
|
|
|
|
throws SAXException, IOException {
|
|
|
|
int c = -1;
|
|
|
|
boolean reconsume = false;
|
|
|
|
stateloop: for (;;) {
|
|
|
|
switch (state) {
|
|
|
|
case DATA:
|
|
|
|
dataloop: for (;;) {
|
|
|
|
if (reconsume) {
|
|
|
|
reconsume = false;
|
|
|
|
} else {
|
|
|
|
c = read();
|
|
|
|
}
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '<':
|
|
|
|
state = MetaScanner.TAG_OPEN;
|
|
|
|
break dataloop; // FALL THROUGH continue
|
|
|
|
// stateloop;
|
|
|
|
default:
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
|
|
|
|
case TAG_OPEN:
|
|
|
|
tagopenloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case 'm':
|
|
|
|
case 'M':
|
|
|
|
metaState = M;
|
|
|
|
state = MetaScanner.TAG_NAME;
|
|
|
|
break tagopenloop;
|
|
|
|
// continue stateloop;
|
|
|
|
case '!':
|
|
|
|
state = MetaScanner.MARKUP_DECLARATION_OPEN;
|
|
|
|
continue stateloop;
|
|
|
|
case '?':
|
|
|
|
case '/':
|
|
|
|
state = MetaScanner.SCAN_UNTIL_GT;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
|
|
|
|
metaState = NO;
|
|
|
|
state = MetaScanner.TAG_NAME;
|
|
|
|
break tagopenloop;
|
|
|
|
// continue stateloop;
|
|
|
|
}
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
reconsume = true;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALL THROUGH DON'T REORDER
|
|
|
|
case TAG_NAME:
|
|
|
|
tagnameloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
case '\u000C':
|
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
|
|
|
|
break tagnameloop;
|
|
|
|
// continue stateloop;
|
|
|
|
case '/':
|
|
|
|
state = MetaScanner.SELF_CLOSING_START_TAG;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
case 'e':
|
|
|
|
case 'E':
|
|
|
|
if (metaState == M) {
|
|
|
|
metaState = E;
|
|
|
|
} else {
|
|
|
|
metaState = NO;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case 't':
|
|
|
|
case 'T':
|
|
|
|
if (metaState == E) {
|
|
|
|
metaState = T;
|
|
|
|
} else {
|
|
|
|
metaState = NO;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case 'a':
|
|
|
|
case 'A':
|
|
|
|
if (metaState == T) {
|
|
|
|
metaState = A;
|
|
|
|
} else {
|
|
|
|
metaState = NO;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
default:
|
|
|
|
metaState = NO;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case BEFORE_ATTRIBUTE_NAME:
|
|
|
|
beforeattributenameloop: for (;;) {
|
|
|
|
if (reconsume) {
|
|
|
|
reconsume = false;
|
|
|
|
} else {
|
|
|
|
c = read();
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Consume the next input character:
|
|
|
|
*/
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
case '\u000C':
|
|
|
|
continue;
|
|
|
|
case '/':
|
|
|
|
state = MetaScanner.SELF_CLOSING_START_TAG;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
if (handleTag()) {
|
|
|
|
break stateloop;
|
|
|
|
}
|
2009-06-29 02:44:22 +04:00
|
|
|
state = DATA;
|
|
|
|
continue stateloop;
|
|
|
|
case 'c':
|
|
|
|
case 'C':
|
|
|
|
contentIndex = 0;
|
|
|
|
charsetIndex = 0;
|
2010-12-08 15:37:19 +03:00
|
|
|
httpEquivIndex = Integer.MAX_VALUE;
|
|
|
|
contentTypeIndex = Integer.MAX_VALUE;
|
|
|
|
state = MetaScanner.ATTRIBUTE_NAME;
|
|
|
|
break beforeattributenameloop;
|
|
|
|
case 'h':
|
|
|
|
case 'H':
|
|
|
|
contentIndex = Integer.MAX_VALUE;
|
|
|
|
charsetIndex = Integer.MAX_VALUE;
|
|
|
|
httpEquivIndex = 0;
|
|
|
|
contentTypeIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.ATTRIBUTE_NAME;
|
|
|
|
break beforeattributenameloop;
|
|
|
|
default:
|
2010-12-08 15:37:19 +03:00
|
|
|
contentIndex = Integer.MAX_VALUE;
|
|
|
|
charsetIndex = Integer.MAX_VALUE;
|
|
|
|
httpEquivIndex = Integer.MAX_VALUE;
|
|
|
|
contentTypeIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.ATTRIBUTE_NAME;
|
|
|
|
break beforeattributenameloop;
|
|
|
|
// continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case ATTRIBUTE_NAME:
|
|
|
|
attributenameloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
case '\u000C':
|
|
|
|
state = MetaScanner.AFTER_ATTRIBUTE_NAME;
|
|
|
|
continue stateloop;
|
|
|
|
case '/':
|
|
|
|
state = MetaScanner.SELF_CLOSING_START_TAG;
|
|
|
|
continue stateloop;
|
|
|
|
case '=':
|
|
|
|
strBufLen = 0;
|
2010-12-08 15:37:19 +03:00
|
|
|
contentTypeIndex = 0;
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
|
|
|
|
break attributenameloop;
|
|
|
|
// continue stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
if (handleTag()) {
|
|
|
|
break stateloop;
|
|
|
|
}
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
if (metaState == A) {
|
|
|
|
if (c >= 'A' && c <= 'Z') {
|
|
|
|
c += 0x20;
|
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
|
|
|
|
++contentIndex;
|
|
|
|
} else {
|
|
|
|
contentIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
|
|
|
|
++charsetIndex;
|
|
|
|
} else {
|
|
|
|
charsetIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
|
|
|
|
++httpEquivIndex;
|
|
|
|
} else {
|
|
|
|
httpEquivIndex = Integer.MAX_VALUE;
|
|
|
|
}
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case BEFORE_ATTRIBUTE_VALUE:
|
|
|
|
beforeattributevalueloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
case '\u000C':
|
|
|
|
continue;
|
|
|
|
case '"':
|
|
|
|
state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
|
|
|
|
break beforeattributevalueloop;
|
|
|
|
// continue stateloop;
|
|
|
|
case '\'':
|
|
|
|
state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
if (handleTag()) {
|
|
|
|
break stateloop;
|
|
|
|
}
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
2010-12-08 15:37:19 +03:00
|
|
|
handleCharInAttributeValue(c);
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
|
|
|
|
attributevaluedoublequotedloop: for (;;) {
|
|
|
|
if (reconsume) {
|
|
|
|
reconsume = false;
|
|
|
|
} else {
|
|
|
|
c = read();
|
|
|
|
}
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '"':
|
2010-12-08 15:37:19 +03:00
|
|
|
handleAttributeValue();
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
|
|
|
|
break attributevaluedoublequotedloop;
|
|
|
|
// continue stateloop;
|
|
|
|
default:
|
2010-12-08 15:37:19 +03:00
|
|
|
handleCharInAttributeValue(c);
|
2009-06-29 02:44:22 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case AFTER_ATTRIBUTE_VALUE_QUOTED:
|
|
|
|
afterattributevaluequotedloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
case '\u000C':
|
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
|
|
|
|
continue stateloop;
|
|
|
|
case '/':
|
|
|
|
state = MetaScanner.SELF_CLOSING_START_TAG;
|
|
|
|
break afterattributevaluequotedloop;
|
|
|
|
// continue stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
if (handleTag()) {
|
|
|
|
break stateloop;
|
|
|
|
}
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
|
|
|
|
reconsume = true;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case SELF_CLOSING_START_TAG:
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
if (handleTag()) {
|
|
|
|
break stateloop;
|
|
|
|
}
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
|
|
|
|
reconsume = true;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
// XXX reorder point
|
|
|
|
case ATTRIBUTE_VALUE_UNQUOTED:
|
|
|
|
for (;;) {
|
|
|
|
if (reconsume) {
|
|
|
|
reconsume = false;
|
|
|
|
} else {
|
|
|
|
c = read();
|
|
|
|
}
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
|
|
|
|
case '\u000C':
|
2010-12-08 15:37:19 +03:00
|
|
|
handleAttributeValue();
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
handleAttributeValue();
|
|
|
|
if (handleTag()) {
|
2009-06-29 02:44:22 +04:00
|
|
|
break stateloop;
|
|
|
|
}
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
2010-12-08 15:37:19 +03:00
|
|
|
handleCharInAttributeValue(c);
|
2009-06-29 02:44:22 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// XXX reorder point
|
|
|
|
case AFTER_ATTRIBUTE_NAME:
|
|
|
|
for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
case '\n':
|
|
|
|
case '\u000C':
|
|
|
|
continue;
|
|
|
|
case '/':
|
2010-12-08 15:37:19 +03:00
|
|
|
handleAttributeValue();
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.SELF_CLOSING_START_TAG;
|
|
|
|
continue stateloop;
|
|
|
|
case '=':
|
2010-12-08 15:37:19 +03:00
|
|
|
strBufLen = 0;
|
|
|
|
contentTypeIndex = 0;
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
2010-12-08 15:37:19 +03:00
|
|
|
handleAttributeValue();
|
|
|
|
if (handleTag()) {
|
2009-06-29 02:44:22 +04:00
|
|
|
break stateloop;
|
|
|
|
}
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
case 'c':
|
|
|
|
case 'C':
|
|
|
|
contentIndex = 0;
|
|
|
|
charsetIndex = 0;
|
|
|
|
state = MetaScanner.ATTRIBUTE_NAME;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
2011-09-07 13:50:15 +04:00
|
|
|
contentIndex = Integer.MAX_VALUE;
|
|
|
|
charsetIndex = Integer.MAX_VALUE;
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.ATTRIBUTE_NAME;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// XXX reorder point
|
|
|
|
case MARKUP_DECLARATION_OPEN:
|
|
|
|
markupdeclarationopenloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '-':
|
|
|
|
state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
|
|
|
|
break markupdeclarationopenloop;
|
|
|
|
// continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.SCAN_UNTIL_GT;
|
|
|
|
reconsume = true;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case MARKUP_DECLARATION_HYPHEN:
|
|
|
|
markupdeclarationhyphenloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '-':
|
|
|
|
state = MetaScanner.COMMENT_START;
|
|
|
|
break markupdeclarationhyphenloop;
|
|
|
|
// continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.SCAN_UNTIL_GT;
|
|
|
|
reconsume = true;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case COMMENT_START:
|
|
|
|
commentstartloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '-':
|
|
|
|
state = MetaScanner.COMMENT_START_DASH;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.COMMENT;
|
|
|
|
break commentstartloop;
|
|
|
|
// continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case COMMENT:
|
|
|
|
commentloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '-':
|
|
|
|
state = MetaScanner.COMMENT_END_DASH;
|
|
|
|
break commentloop;
|
|
|
|
// continue stateloop;
|
|
|
|
default:
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case COMMENT_END_DASH:
|
|
|
|
commentenddashloop: for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '-':
|
|
|
|
state = MetaScanner.COMMENT_END;
|
|
|
|
break commentenddashloop;
|
|
|
|
// continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.COMMENT;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// FALLTHRU DON'T REORDER
|
|
|
|
case COMMENT_END:
|
|
|
|
for (;;) {
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '>':
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
case '-':
|
|
|
|
continue;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.COMMENT;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// XXX reorder point
|
|
|
|
case COMMENT_START_DASH:
|
|
|
|
c = read();
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '-':
|
|
|
|
state = MetaScanner.COMMENT_END;
|
|
|
|
continue stateloop;
|
|
|
|
case '>':
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
state = MetaScanner.COMMENT;
|
|
|
|
continue stateloop;
|
|
|
|
}
|
|
|
|
// XXX reorder point
|
|
|
|
case ATTRIBUTE_VALUE_SINGLE_QUOTED:
|
|
|
|
for (;;) {
|
|
|
|
if (reconsume) {
|
|
|
|
reconsume = false;
|
|
|
|
} else {
|
|
|
|
c = read();
|
|
|
|
}
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '\'':
|
2010-12-08 15:37:19 +03:00
|
|
|
handleAttributeValue();
|
2009-06-29 02:44:22 +04:00
|
|
|
state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
2010-12-08 15:37:19 +03:00
|
|
|
handleCharInAttributeValue(c);
|
2009-06-29 02:44:22 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// XXX reorder point
|
|
|
|
case SCAN_UNTIL_GT:
|
|
|
|
for (;;) {
|
|
|
|
if (reconsume) {
|
|
|
|
reconsume = false;
|
|
|
|
} else {
|
|
|
|
c = read();
|
|
|
|
}
|
|
|
|
switch (c) {
|
|
|
|
case -1:
|
|
|
|
break stateloop;
|
|
|
|
case '>':
|
|
|
|
state = MetaScanner.DATA;
|
|
|
|
continue stateloop;
|
|
|
|
default:
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
stateSave = state;
|
|
|
|
}
|
|
|
|
|
2010-12-08 15:37:19 +03:00
|
|
|
private void handleCharInAttributeValue(int c) {
|
|
|
|
if (metaState == A) {
|
|
|
|
if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
|
|
|
|
addToBuffer(c);
|
|
|
|
} else if (httpEquivIndex == HTTP_EQUIV.length) {
|
|
|
|
if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
|
|
|
|
++contentTypeIndex;
|
|
|
|
} else {
|
|
|
|
contentTypeIndex = Integer.MAX_VALUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
@Inline private int toAsciiLowerCase(int c) {
|
|
|
|
if (c >= 'A' && c <= 'Z') {
|
|
|
|
return c + 0x20;
|
|
|
|
}
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* Adds a character to the accumulation buffer.
|
|
|
|
* @param c the character to add
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
private void addToBuffer(int c) {
|
|
|
|
if (strBufLen == strBuf.length) {
|
|
|
|
char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
|
|
|
|
System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
|
|
|
|
strBuf = newBuf;
|
|
|
|
}
|
|
|
|
strBuf[strBufLen++] = (char)c;
|
|
|
|
}
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* Attempts to extract a charset name from the accumulation buffer.
|
|
|
|
* @return <code>true</code> if successful
|
|
|
|
* @throws SAXException
|
|
|
|
*/
|
2010-12-08 15:37:19 +03:00
|
|
|
private void handleAttributeValue() throws SAXException {
|
|
|
|
if (metaState != A) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (contentIndex == CONTENT.length && content == null) {
|
|
|
|
content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
|
|
|
|
return;
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
if (charsetIndex == CHARSET.length && charset == null) {
|
|
|
|
charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
|
|
|
|
return;
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
if (httpEquivIndex == HTTP_EQUIV.length
|
|
|
|
&& httpEquivState == HTTP_EQUIV_NOT_SEEN) {
|
|
|
|
httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
|
|
|
|
: HTTP_EQUIV_OTHER;
|
|
|
|
return;
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
2010-12-08 15:37:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
private boolean handleTag() throws SAXException {
|
|
|
|
boolean stop = handleTagInner();
|
|
|
|
Portability.releaseString(content);
|
|
|
|
content = null;
|
|
|
|
Portability.releaseString(charset);
|
|
|
|
charset = null;
|
|
|
|
httpEquivState = HTTP_EQUIV_NOT_SEEN;
|
|
|
|
return stop;
|
2009-06-29 02:44:22 +04:00
|
|
|
}
|
|
|
|
|
2010-12-08 15:37:19 +03:00
|
|
|
private boolean handleTagInner() throws SAXException {
|
|
|
|
if (charset != null && tryCharset(charset)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
|
|
|
|
String extract = TreeBuilder.extractCharsetFromContent(content);
|
|
|
|
if (extract == null) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
boolean success = tryCharset(extract);
|
|
|
|
Portability.releaseString(extract);
|
|
|
|
return success;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-04-09 19:48:29 +04:00
|
|
|
/**
|
|
|
|
* Tries to switch to an encoding.
|
|
|
|
*
|
|
|
|
* @param encoding
|
|
|
|
* @return <code>true</code> if successful
|
|
|
|
* @throws SAXException
|
|
|
|
*/
|
2009-06-29 02:44:22 +04:00
|
|
|
protected abstract boolean tryCharset(String encoding) throws SAXException;
|
|
|
|
|
|
|
|
}
|