diff --git a/appvalidator/python/HTMLParser.py b/appvalidator/python/HTMLParser.py
new file mode 100644
index 0000000..b336a4c
--- /dev/null
+++ b/appvalidator/python/HTMLParser.py
@@ -0,0 +1,472 @@
+"""A parser for HTML and XHTML."""
+
+# This file is based on sgmllib.py, but the API is slightly different.
+
+# XXX There should be a way to distinguish between PCDATA (parsed
+# character data -- the normal case), RCDATA (replaceable character
+# data -- only char and entity references and end tags are special)
+# and CDATA (character data -- only end tags are special).
+
+
+import markupbase
+import re
+
+# Regular expressions used for parsing
+
+# Next character that may start markup or a reference: '&' or '<'.
+interesting_normal = re.compile('[&<]')
+# '&' followed by a letter or '#' (presumably the start of a reference
+# whose remainder has not been seen yet -- confirm against goahead()).
+incomplete = re.compile('&[a-zA-Z#]')
+
+# Named entity reference; group 1 is the name.  One terminating
+# non-alphanumeric character is consumed as part of the match.
+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+# Numeric character reference: '&#' + decimal digits, or '&#x'/'&#X' +
+# hex digits, plus one terminating character that is not a hex digit.
+# The leading '&#' is required: without it the pattern can never match
+# at the position of the '&' that introduces the reference.
+charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+
+# '<' immediately followed by a letter.
+starttagopen = re.compile('<[a-zA-Z]')
+piclose = re.compile('>')
+commentclose = re.compile(r'--\s*>')
+# Tag name (group 1) followed by any run of whitespace or '/' characters
+# that are not part of a closing '/>'.
+tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
+# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
+# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
+tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
+
+# One attribute: group 1 is the name, group 2 the optional '= value'
+# part, group 3 the value itself (single-quoted, double-quoted or bare).
+attrfind = re.compile(
+    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
+
+# Matches a start tag from '<' through its attributes and trailing
+# whitespace, stopping before the closing '>' or '/>'.
+locatestarttagend = re.compile(r"""
+ <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
+ (?:[\s/]* # optional whitespace before attribute name
+ (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
+ (?:\s*=+\s* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\s]* # bare value
+ )
+ )?(?:\s|/(?!>))*
+ )*
+ )?
+ \s* # trailing whitespace
+""", re.VERBOSE)
+endendtag = re.compile('>')
+# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
+# </ and the tag name, so maybe this should be fixed
+# Complete end tag; group 1 is the tag name.  The pattern must begin with
+# the literal '</' so that it can match at the '<' that opens the tag.
+endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+
+
+class HTMLParseError(Exception):
+    """Exception raised for all parse errors.
+
+    Attributes set by the constructor:
+        msg    -- description of the problem (asserted to be truthy)
+        lineno -- position[0], or None when no line is known
+        offset -- position[1], or None when no column is known;
+                  __str__ reports it as offset + 1
+    """
+
+    def __init__(self, msg, position=(None, None)):
+        assert msg
+        self.msg = msg
+        self.lineno = position[0]
+        self.offset = position[1]
+
+    def __str__(self):
+        """Return msg, followed by the line and column when they are set."""
+        pieces = [self.msg]
+        if self.lineno is not None:
+            pieces.append(", at line %d" % self.lineno)
+        if self.offset is not None:
+            pieces.append(", column %d" % (self.offset + 1))
+        return "".join(pieces)
+
+
+class HTMLParser(markupbase.ParserBase):
+ """Find tags and other markup and call handler functions.
+
+ Usage:
+ p = HTMLParser()
+ p.feed(data)
+ ...
+ p.close()
+
+ Start tags are handled by calling self.handle_starttag() or
+ self.handle_startendtag(); end tags by self.handle_endtag(). The
+ data between tags is passed from the parser to the derived class
+ by calling self.handle_data() with the data as argument (the data
+ may be split up in arbitrary chunks). Entity references are
+ passed by calling self.handle_entityref() with the entity
+ reference as the argument. Numeric character references are
+ passed to self.handle_charref() with the string containing the
+ reference as the argument.
+ """
+
+ CDATA_CONTENT_ELEMENTS = ("script", "style")
+
+
+ def __init__(self):
+ """Initialize and reset this instance.
+
+ Construction consists solely of a call to reset().
+ """
+ self.reset()
+
+ def reset(self):
+     """Reset this instance.  Loses all unprocessed data.
+
+     Clears the input buffer, restores the placeholder last-tag value,
+     returns to normal (non-CDATA) scanning and then resets the
+     markupbase.ParserBase state.
+     """
+     # Normal scanning: no CDATA element is open and the search pattern
+     # is the module-level interesting_normal.
+     self.cdata_elem = None
+     self.interesting = interesting_normal
+     self.lasttag = '???'
+     self.rawdata = ''
+     markupbase.ParserBase.reset(self)
+
+ def feed(self, data):
+     r"""Feed data to the parser.
+
+     May be called any number of times with text of any length (which
+     may include '\n').  Each call appends data to the buffered input
+     and then runs goahead(0).
+     """
+     self.rawdata += data
+     self.goahead(0)
+
+ def close(self):
+ """Handle any buffered data.
+
+ Calls goahead(1): a true 'end' argument forces handling of all
+ data as if it were followed by an EOF marker.
+ """
+ self.goahead(1)
+
+ def error(self, message):
+     """Raise HTMLParseError for message, positioned at self.getpos()."""
+     position = self.getpos()
+     raise HTMLParseError(message, position)
+
+ __starttag_text = None
+
+ def get_starttag_text(self):
+ """Return full source of start tag: '<...>'.
+
+ Returns the private __starttag_text attribute, whose class-level
+ default is None.
+ """
+ return self.__starttag_text
+
+ def set_cdata_mode(self, elem):
+     """Enter CDATA mode for the element named elem.
+
+     Records the lower-cased element name in self.cdata_elem and replaces
+     self.interesting with a case-insensitive pattern for that element's
+     end tag.
+     """
+     self.cdata_elem = elem.lower()
+     # The pattern must start with the literal '</': without it the search
+     # would stop at the bare element name followed by '>' rather than at
+     # the '<' that opens the closing tag.
+     self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
+
+ def clear_cdata_mode(self):
+     """Leave CDATA mode and resume scanning with interesting_normal."""
+     self.cdata_elem = None
+     self.interesting = interesting_normal
+
+ # Internal -- handle data as far as reasonable. May leave state
+ # and data to be processed by a subsequent call. If 'end' is
+ # true, force handling all data as if followed by EOF marker.
+ def goahead(self, end):
+ rawdata = self.rawdata
+ i = 0
+ n = len(rawdata)
+ while i < n:
+ match = self.interesting.search(rawdata, i) # < or &
+ if match:
+ j = match.start()
+ else:
+ if self.cdata_elem:
+ break
+ j = n
+ if i < j: self.handle_data(rawdata[i:j])
+ i = self.updatepos(i, j)
+ if i == n: break
+ startswith = rawdata.startswith
+ if startswith('<', i):
+ if starttagopen.match(rawdata, i): # < + letter
+ k = self.parse_starttag(i)
+ elif startswith("", i):
+ k = self.parse_endtag(i)
+ elif startswith("
-
-
-
\ No newline at end of file
diff --git a/tests/test_controlchars.py b/tests/test_controlchars.py
index 23fc6d2..95da743 100644
--- a/tests/test_controlchars.py
+++ b/tests/test_controlchars.py
@@ -35,22 +35,6 @@ class TestControlChars(TestCase):
self.assert_failed(with_warnings=True)
eq_(self.err.warnings[0]["id"][2], "syntax_error")
- def test_controlchars_utf8_ok(self):
- """Test that multi-byte characters are decoded properly (utf-8)."""
-
- self.run_test("tests/resources/controlchars/controlchars_utf-8_ok.js")
- self.assert_silent()
-
- def test_controlchars_utf8_warn(self):
- """
- Tests that multi-byte characters are decoded properly (utf-8) but remaining
- non-ASCII characters raise warnings.
- """
-
- self.run_test("tests/resources/controlchars/controlchars_utf-8_warn.js")
- self.assert_failed(with_warnings=True)
- eq_(self.err.warnings[0]["id"][2], "syntax_error")
-
@raises(JSONDecodeError)
def test_controlchar_in_webapp(self):
"""
diff --git a/tests/test_markup_markuptester.py b/tests/test_markup_markuptester.py
index 667a617..06aec46 100644
--- a/tests/test_markup_markuptester.py
+++ b/tests/test_markup_markuptester.py
@@ -156,16 +156,6 @@ def test_html_ignore_comment():
_test_xul("tests/resources/markup/markuptester/ignore_comments.html")
-def test_invalid_markup():
- "Tests an markup file that is simply broken."
-
- result = _test_xul("tests/resources/markup/markuptester/bad.xml", True)
- assert result.warnings
- result = _test_xul("tests/resources/markup/markuptester/bad_script.xml",
- False)
- assert result.notices
-
-
def test_bad_encoding():
"""Test that bad encodings don't cause the parser to fail."""
_test_xul("tests/resources/markup/encoding.txt")
diff --git a/tests/test_submain_package.py b/tests/test_submain_package.py
index 2fa0960..ffcf5e4 100644
--- a/tests/test_submain_package.py
+++ b/tests/test_submain_package.py
@@ -32,14 +32,4 @@ class TestSubmainPackage(TestCase):
with open(name) as pack:
result = submain.test_package(self.err, pack, name)
- self.assert_failed()
-
- def test_package_corrupt(self):
- "Tests the test_package function fails with a corrupt file"
-
- self.setup_err()
-
- name = "tests/resources/corrupt.xpi"
- result = submain.test_package(self.err, name, name)
-
- self.assert_failed(with_errors=True, with_warnings=True)
+ assert self.err.errors
diff --git a/tests/test_xpimanager.py b/tests/test_xpimanager.py
index 662d812..fd6ee57 100644
--- a/tests/test_xpimanager.py
+++ b/tests/test_xpimanager.py
@@ -72,15 +72,3 @@ class TestBadZipFile(TestCase):
def test_missing_file(self):
"""Tests that the XPI manager correctly reports a missing XPI file."""
ZipPackage("foo.bar")
-
- def test_corrupt_zip(self):
- """Tests that the XPI manager correctly reports a missing XPI file."""
- x = ZipPackage(get_path("corrupt.xpi"))
- try:
- x.read("install.rdf")
- except Exception:
- pass
- else:
- raise "Exception should have been raised on corrupt file access."
-
- assert "install.rdf" in x.broken_files