Updated markup parser for unicode; fixed bug 648596

This commit is contained in:
Matt Basta 2011-04-13 21:16:09 +00:00
Родитель 769e5f7313
Коммит 908287ce76
4 изменённых файлов: 63 добавлений и 33 удалений

Просмотреть файл

@ -1,24 +1,26 @@
# -*- coding: utf-8 -*-
import validator.testcases.markup.markuptester as markuptester
from validator.errorbundler import ErrorBundle
from validator.constants import *
def _do_test(path, should_fail=False, type_=None):
return _do_test_raw(open(path).read(),
path,
should_fail,
type_)
markup_file = open(path)
data = markup_file.read()
markup_file.close()
def _do_test_raw(data, path, should_fail=False, type_=None):
filename = path.split("/")[-1]
extension = filename.split(".")[-1]
err = ErrorBundle(None, True)
err = ErrorBundle()
if type_:
err.set_type(type_)
parser = markuptester.MarkupParser(err, debug=True)
parser.process(filename, data, extension)
err.print_summary(True)
print err.print_summary(verbose=True)
if should_fail:
assert err.failed()
@ -31,7 +33,7 @@ def _do_test(path, should_fail=False, type_=None):
def test_local_url_detector():
"Tests that local URLs can be detected."
err = ErrorBundle(None, True)
err = ErrorBundle()
mp = markuptester.MarkupParser(err)
tester = mp._is_url_local
@ -135,3 +137,18 @@ def test_invalid_markup():
result = _do_test("tests/resources/markup/markuptester/bad_script.xml",
False)
assert result.notices
def test_self_closing_scripts():
    """Tests that self-closing script tags are not deleterious to parsing.

    Passes an XML snippet through `_do_test_raw` under the file name
    "foo.js". The snippet's root element contains a self-closing
    <script/> tag followed by several self-closing <list_item/> elements.
    """
    # The markup is kept exactly as the fixture spells it, including the
    # value of the `undecodable` attribute.
    markup = """
<foo>
<script type="text/javascript"/>
<list_item undecodable=" _ " />
<list_item />
<list_item />
</foo>
"""
    _do_test_raw(markup, "foo.js")

Просмотреть файл

@ -85,7 +85,7 @@ def main():
# Print the output of the tests based on the requested format.
if args.output == "text":
print error_bundle.print_summary(verbose=args.verbose,
no_color=args.boring)
no_color=args.boring).encode("utf-8")
elif args.output == "json":
sys.stdout.write(error_bundle.render_json())

Просмотреть файл

@ -3,6 +3,7 @@ from StringIO import StringIO
from validator import decorator
from validator import submain as testendpoint_validator
from validator import unicodehelper
import validator.testcases.markup.markuptester as testendpoint_markup
import validator.testcases.markup.csstester as testendpoint_css
import validator.testcases.scripting as testendpoint_js
@ -142,9 +143,8 @@ def test_packed_packages(err, package_contents=None, xpi_package=None):
if not file_data:
continue
# Skip BOMs and the like
while not is_standard_ascii(file_data[0]):
file_data = file_data[1:]
# Convert the file data to unicode
file_data = unicodehelper.decode(file_data)
if data["extension"] == "css":
testendpoint_css.test_css_file(err,

Просмотреть файл

@ -1,4 +1,3 @@
import re
try:
from HTMLParser import HTMLParser
@ -6,6 +5,7 @@ except ImportError: # pragma: no cover
from html.parser import HTMLParser
import validator.testcases.scripting as scripting
import validator.unicodehelper as unicodehelper
from validator.testcases.markup import csstester
from validator.contextgenerator import ContextGenerator
from validator.constants import *
@ -51,7 +51,7 @@ class MarkupParser(HTMLParser):
self.xml_state = []
self.xml_buffer = []
self.reported = {}
self.reported = set()
def process(self, filename, data, extension="xul"):
"""Processes data by splitting it into individual lines, then
@ -61,7 +61,7 @@ class MarkupParser(HTMLParser):
self.filename = filename
self.extension = extension
self.reported = {}
self.reported = set()
self.context = ContextGenerator(data)
@ -100,6 +100,8 @@ class MarkupParser(HTMLParser):
try:
self.feed(line + "\n")
except UnicodeDecodeError:
raise
except Exception as inst:
if DEBUG: # pragma: no cover
print self.xml_state, inst
@ -107,8 +109,8 @@ class MarkupParser(HTMLParser):
if "markup" in self.reported:
return
if "script" in self.xml_state or (
self.debug and "testscript" in self.xml_state):
if ("script" in self.xml_state or
self.debug and "testscript" in self.xml_state):
if "script_comments" in self.reported or not self.strict:
return
self.err.notice(("testcases_markup_markuptester",
@ -122,7 +124,7 @@ class MarkupParser(HTMLParser):
self.filename,
line=self.line,
context=self.context)
self.reported["script_comments"] = True
self.reported.add("script_comments")
return
if self.strict:
@ -136,7 +138,7 @@ class MarkupParser(HTMLParser):
self.filename,
line=self.line,
context=self.context)
self.reported["markup"] = True
self.reported.add("markup")
def handle_startendtag(self, tag, attrs):
# Self closing tags don't have an end tag, so we want to
@ -154,7 +156,7 @@ class MarkupParser(HTMLParser):
self_closing = tag in SELF_CLOSING_TAGS
if DEBUG: # pragma: no cover
print self.xml_state, tag, self_closing
print "S: ", self.xml_state, tag, self_closing
# A fictional tag for testing purposes.
if tag == "xbannedxtestx":
@ -286,17 +288,19 @@ class MarkupParser(HTMLParser):
return
self.xml_state.append(tag)
self.xml_buffer.append("")
self.xml_buffer.append(unicode(""))
def handle_endtag(self, tag):
tag = tag.lower()
if DEBUG: # pragma: no cover
print tag, self.xml_state
print "E: ", tag, self.xml_state
if not self.xml_state:
if "closing_tags" in self.reported or not self.strict:
if DEBUG:
print "Unstrict; extra closing tags ------"
return
self.err.warning(("testcases_markup_markuptester",
"handle_endtag",
@ -307,16 +311,18 @@ class MarkupParser(HTMLParser):
self.filename,
line=self.line,
context=self.context)
self.reported["closing_tags"] = True
self.reported.add("closing_tags")
if DEBUG: # pragma: no cover
print "Too many closing tags ------"
return
elif "script" in self.xml_state:
elif "script" in self.xml_state[:-1]:
# If we're in a script tag, nothing else matters. Just rush
# everything possible into the xml buffer.
self._save_to_buffer("</" + tag + ">")
if DEBUG:
print "Markup as text in script ------"
return
elif tag not in self.xml_state:
@ -344,6 +350,8 @@ class MarkupParser(HTMLParser):
# classifies as a self-closing tag, we just recursively close
# down to the level of the tag we're actually closing.
if old_state != tag and old_state in SELF_CLOSING_TAGS:
if DEBUG:
print "Self closing tag cascading down ------"
return self.handle_endtag(tag)
# If this is an XML-derived language, everything must nest
@ -365,17 +373,20 @@ class MarkupParser(HTMLParser):
if DEBUG: # pragma: no cover
print "Invalid markup nesting ------"
data_buffer = data_buffer.strip()
# Perform analysis on collected data.
if tag == "script":
scripting.test_js_snippet(self.err,
data_buffer,
self.filename,
self.line)
elif tag == "style":
csstester.test_css_file(self.err,
self.filename,
data_buffer,
self.line)
if data_buffer:
if tag == "script":
scripting.test_js_snippet(self.err,
data_buffer,
self.filename,
self.line)
elif tag == "style":
csstester.test_css_file(self.err,
self.filename,
data_buffer,
self.line)
def handle_data(self, data):
self._save_to_buffer(data)
@ -413,6 +424,8 @@ class MarkupParser(HTMLParser):
if not self.xml_buffer:
return
data = unicodehelper.decode(data)
self.xml_buffer[-1] += data
def _format_args(self, args):