Merged encoding fixes into codebase

2011-04-21 17:11:17 +00:00 · 2011-04-21 17:11:17 +00:00 · 6e2e1fd6b4
--- a/tests/resources/bug_621360.js
+++ b/tests/resources/bug_621360.js
--- a/tests/resources/controlchars.js
+++ b/tests/resources/controlchars.js
--- a/tests/resources/controlchars/controlchars_ascii_ok.js
+++ b/tests/resources/controlchars/controlchars_ascii_ok.js
@ -0,0 +1 @@
+function test() {}
--- a/tests/resources/controlchars/controlchars_ascii_warn.js
+++ b/tests/resources/controlchars/controlchars_ascii_warn.js
--- a/tests/resources/controlchars/controlchars_bad.js
+++ b/tests/resources/controlchars/controlchars_bad.js
--- a/tests/resources/controlchars/controlchars_utf-8_ok.js
+++ b/tests/resources/controlchars/controlchars_utf-8_ok.js
@ -0,0 +1 @@
+function täst() {}
--- a/tests/resources/controlchars/controlchars_utf-8_warn.js
+++ b/tests/resources/controlchars/controlchars_utf-8_warn.js
--- a/tests/resources/unicodehelper/latin_1.txt
+++ b/tests/resources/unicodehelper/latin_1.txt
@ -0,0 +1 @@
+täst
--- a/tests/resources/unicodehelper/utf-16be.txt
+++ b/tests/resources/unicodehelper/utf-16be.txt
--- a/tests/resources/unicodehelper/utf-16le.txt
+++ b/tests/resources/unicodehelper/utf-16le.txt
--- a/tests/resources/unicodehelper/utf-32be.txt
+++ b/tests/resources/unicodehelper/utf-32be.txt
--- a/tests/resources/unicodehelper/utf-32le.txt
+++ b/tests/resources/unicodehelper/utf-32le.txt
--- a/tests/resources/unicodehelper/utf-8-bom.txt
+++ b/tests/resources/unicodehelper/utf-8-bom.txt
@ -0,0 +1 @@
+täst
--- a/tests/resources/unicodehelper/utf-8.txt
+++ b/tests/resources/unicodehelper/utf-8.txt
@ -0,0 +1 @@
+täst
--- a/tests/test_bug_621360.py
+++ b/tests/test_bug_621360.py
@ -1,20 +0,0 @@
-import os
-import validator.testcases.scripting
-
-def _do_test(path):
-    "Performs a test on a JS file"
-    script = open(path).read()
-    
-    err = validator.testcases.scripting.traverser.MockBundler()
-    validator.testcases.scripting.test_js_file(err, path, script)
-
-    return err
-
-def test_control_chars():
-    "Tests that control characters throw a single error"
-
-    err = _do_test("tests/resources/bug_621360.js")
-    # There should be a single error.
-    print err.message_count
-    assert err.message_count == 1
-
--- a/tests/test_controlchars.py
+++ b/tests/test_controlchars.py
@ -0,0 +1,47 @@
+import os
+
+import validator.unicodehelper
+import validator.testcases.scripting
+
+# Originated from bug 626496
+
+def _do_test(path):
+    "Performs a test on a JS file"
+    script = validator.unicodehelper.decode(open(path, "rb").read())
+    print script.encode("ascii", "replace")
+
+    err = validator.testcases.scripting.traverser.MockBundler()
+    validator.testcases.scripting.test_js_file(err, path, script)
+
+    print err.ids
+
+    return err
+
+def test_controlchars_ascii_ok():
+    """Tests that a plain ASCII JS file produces no messages"""
+
+    errs = _do_test("tests/resources/controlchars/controlchars_ascii_ok.js")
+    assert len(errs.ids) == 0
+
+def test_controlchars_ascii_warn():
+    """Tests that the ascii "warn" fixture produces exactly one message
+    whose id has "syntax_error" as its third element"""
+
+    errs = _do_test("tests/resources/controlchars/controlchars_ascii_warn.js")
+    assert len(errs.ids) == 1
+    assert errs.ids[0][2] == "syntax_error"
+
+def test_controlchars_utf8_ok():
+    """Tests that multi-byte characters are decoded properly (utf-8)"""
+
+    errs = _do_test("tests/resources/controlchars/controlchars_utf-8_ok.js")
+    assert len(errs.ids) == 0
+
+def test_controlchars_utf8_warn():
+    """Tests that multi-byte characters are decoded properly (utf-8)
+		but remaining non ascii characters raise warnings"""
+
+    errs = _do_test("tests/resources/controlchars/controlchars_utf-8_warn.js")
+    assert len(errs.ids) == 1
+    assert errs.ids[0][2] == "syntax_error"
+
--- a/tests/test_markup_markuptester.py
+++ b/tests/test_markup_markuptester.py
@ -1,24 +1,26 @@
+# -*- coding: utf-8 -*-
 import validator.testcases.markup.markuptester as markuptester
 from validator.errorbundler import ErrorBundle
 from validator.constants import *

 def _do_test(path, should_fail=False, type_=None):
+    return _do_test_raw(open(path).read(),
+                        path,
+                        should_fail,
+                        type_)

-    markup_file = open(path)
-    data = markup_file.read()
-    markup_file.close()
-
+def _do_test_raw(data, path, should_fail=False, type_=None):
    filename = path.split("/")[-1]
    extension = filename.split(".")[-1]

-    err = ErrorBundle(None, True)
+    err = ErrorBundle()
    if type_:
        err.set_type(type_)

    parser = markuptester.MarkupParser(err, debug=True)
    parser.process(filename, data, extension)

-    err.print_summary(True)
+    print err.print_summary(verbose=True)

    if should_fail:
        assert err.failed()
@ -31,7 +33,7 @@ def _do_test(path, should_fail=False, type_=None):
 def test_local_url_detector():
    "Tests that local URLs can be detected."

-    err = ErrorBundle(None, True)
+    err = ErrorBundle()
    mp = markuptester.MarkupParser(err)
    tester = mp._is_url_local

@ -135,3 +137,18 @@ def test_invalid_markup():
    result = _do_test("tests/resources/markup/markuptester/bad_script.xml",
                      False)
    assert result.notices
+
+
+def test_self_closing_scripts():
+    """Tests that self-closing script tags are not deleterious to parsing"""
+
+    _do_test_raw("""
+    <foo>
+        <script type="text/javascript"/>
+        <list_item undecodable=" _ " />
+        <list_item />
+        <list_item />
+    </foo>
+    """, "foo.js")
+
+
--- a/tests/test_unicodehelper.py
+++ b/tests/test_unicodehelper.py
@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+import nose
+import os
+import validator.unicodehelper as unicodehelper
+
+COMPARISON = "täst".decode("utf-8")
+
+def _do_test(path):
+    "Decodes a fixture file and checks the result against COMPARISON"
+    # Read raw bytes: text mode may alter UTF-16/32 data on some platforms.
+    with open(path, "rb") as handle:
+        text = handle.read()
+    utext = unicodehelper.decode(text)
+    print utext.encode("ascii", "backslashreplace")
+    nose.tools.eq_(utext, COMPARISON)
+
+def test_latin1():
+    "Tests latin-1 encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/latin_1.txt")
+
+def test_utf8():
+    "Tests utf-8 w/o BOM encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/utf-8.txt")
+
+def test_utf8_bom():
+    "Tests utf-8 with BOM encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/utf-8-bom.txt")
+
+def test_utf16le():
+    "Tests utf-16 Little Endian encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/utf-16le.txt")
+
+def test_utf16be():
+    "Tests utf-16 Big Endian encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/utf-16be.txt")
+
+def test_utf32le():
+    "Tests utf-32 Little Endian encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/utf-32le.txt")
+
+def test_utf32be():
+    "Tests utf-32 Big Endian encoding is properly decoded"
+    _do_test("tests/resources/unicodehelper/utf-32be.txt")
--- a/validator/contextgenerator.py
+++ b/validator/contextgenerator.py
@ -1,6 +1,5 @@
 from StringIO import StringIO
-
-import textfilter
+import unicodehelper


 class ContextGenerator:
@ -83,6 +82,6 @@ class ContextGenerator:
                data = "%s ..." % data[:140]

        data = "%s%s" % (raw_data[0:with_ws - line_length], data)
-        data = textfilter.filter_ascii(data)
+        data = unicodehelper.decode(data)
        return data

--- a/validator/errorbundler.py
+++ b/validator/errorbundler.py
@ -3,7 +3,7 @@ import uuid
 from StringIO import StringIO

 from outputhandlers.shellcolors import OutputHandler
-from textfilter import filter_ascii
+import unicodehelper


 class ErrorBundle(object):
@ -110,8 +110,8 @@ class ErrorBundle(object):
        else:
            message["context"] = None

-        message["message"] = filter_ascii(message["message"])
-        message["description"] = filter_ascii(message["description"])
+        message["message"] = unicodehelper.decode(message["message"])
+        message["description"] = unicodehelper.decode(message["description"])

        stack.append(message)

--- a/validator/main.py
+++ b/validator/main.py
@ -85,7 +85,7 @@ def main():
    # Print the output of the tests based on the requested format.
    if args.output == "text":
        print error_bundle.print_summary(verbose=args.verbose,
-                                         no_color=args.boring)
+                                         no_color=args.boring).encode("utf-8")
    elif args.output == "json":
        sys.stdout.write(error_bundle.render_json())

--- a/validator/testcases/content.py
+++ b/validator/testcases/content.py
@ -6,6 +6,7 @@ from StringIO import StringIO
 from validator.contextgenerator import ContextGenerator
 from validator import decorator
 from validator import submain as testendpoint_validator
+from validator import unicodehelper
 import validator.testcases.markup.markuptester as testendpoint_markup
 import validator.testcases.markup.csstester as testendpoint_css
 import validator.testcases.scripting as testendpoint_js
@ -165,9 +166,8 @@ def test_packed_packages(err, package_contents=None, xpi_package=None):
            if not file_data:
                continue

-            # Skip BOMs and the like
-            while not is_standard_ascii(file_data[0]):
-                file_data = file_data[1:]
+            # Convert the file data to unicode
+            file_data = unicodehelper.decode(file_data)

            if data["extension"] == "css":
                testendpoint_css.test_css_file(err,
--- a/validator/testcases/javascript/actions.py
+++ b/validator/testcases/javascript/actions.py
@ -291,12 +291,14 @@ def _call_expression(traverser, node):
            result = dangerous(a=args, t=t)
        if result:
            # Generate a string representation of the params
-            params = ", ".join([str(t(p).get_literal_value()) for p in args])
+            params = u", ".join([unicode(t(p).get_literal_value()) for
+                                 p in args])
            traverser.err.warning(("testcases_javascript_actions",
                                   "_call_expression",
                                   "called_dangerous_global"),
                                  "Global called in dangerous manner",
-                                  result if isinstance(result, str) else
+                                  result if isinstance(result,
+                                                       types.StringTypes) else
                                  "A global function was called using a set "
                                  "of dangerous parameters. These parameters "
                                  "have been disallowed.",
@ -418,10 +420,10 @@ def _expr_assignment(traverser, node):
        if lit_right is None:
            lit_right = 0

-        if isinstance(lit_left, (str, unicode)) or \
-           isinstance(lit_right, (str, unicode)):
-            lit_left = str(lit_left)
-            lit_right = str(lit_right)
+        if isinstance(lit_left, types.StringTypes) or \
+           isinstance(lit_right, types.StringTypes):
+            lit_left = unicode(lit_left)
+            lit_right = unicode(lit_right)

        gleft = _get_as_num(left)
        gright = _get_as_num(right)
@ -446,8 +448,8 @@ def _expr_assignment(traverser, node):
            traverser.debug_level -= 1
            return left

-        traverser._debug("ASSIGNMENT::LEFT>>%s" % str(left.is_global))
-        traverser._debug("ASSIGNMENT::RIGHT>>%s" % str(operators[token]()))
+        traverser._debug("ASSIGNMENT::LEFT>>%s" % unicode(left.is_global))
+        traverser._debug("ASSIGNMENT::RIGHT>>%s" % unicode(operators[token]()))
        left.set_value(operators[token](), traverser=traverser)
        traverser.debug_level -= 1
        return left
@ -469,7 +471,7 @@ def _expr_binary(traverser, node):
    left = traverser._traverse_node(node["left"])
    if not isinstance(left, JSWrapper):
        left = JSWrapper(left, traverser=traverser)
-    traverser._debug(str(left.dirty))
+    traverser._debug(unicode(left.dirty))

    traverser.debug_level -= 1

@ -479,7 +481,7 @@ def _expr_binary(traverser, node):
    right = traverser._traverse_node(node["right"])
    if not isinstance(right, JSWrapper):
        right = JSWrapper(right, traverser=traverser)
-    traverser._debug(str(right.dirty))
+    traverser._debug(unicode(right.dirty))

    if left.dirty:
        return left
@ -589,7 +591,7 @@ def _get_as_num(value):
        return False

    try:
-        if isinstance(value, str):
+        if isinstance(value, types.StringTypes):
            return float(value)
        elif isinstance(value, int) or isinstance(value, float):
            return value
--- a/validator/testcases/javascript/instanceactions.py
+++ b/validator/testcases/javascript/instanceactions.py
@ -9,7 +9,7 @@ traverser
 node
    the current node being evaluated
 """
-
+import types
 from jstypes import *


@ -21,10 +21,11 @@ def createElement(args, traverser, node):

    simple_args = [traverser._traverse_node(a) for a in args]

-    if str(simple_args[0].get_literal_value()).lower() == "script":
+    if unicode(simple_args[0].get_literal_value()).lower() == u"script":
        _create_script_tag(traverser)
    elif not (simple_args[0].is_literal() or
-              isinstance(simple_args[0].get_literal_value(), str)):
+              isinstance(simple_args[0].get_literal_value(),
+                         types.StringTypes)):
        _create_variable_element(traverser)


@ -36,10 +37,11 @@ def createElementNS(args, traverser, node):

    simple_args = [traverser._traverse_node(a) for a in args]

-    if "script" in str(simple_args[1].get_literal_value()).lower():
+    if "script" in unicode(simple_args[1].get_literal_value()).lower():
        _create_script_tag(traverser)
    elif not (simple_args[1].is_literal() or
-              isinstance(simple_args[1].get_literal_value(), str)):
+              isinstance(simple_args[1].get_literal_value(),
+                         types.StringTypes)):
        _create_variable_element(traverser)


@ -115,7 +117,7 @@ def setAttribute(args, traverser, node):

    simple_args = [traverser._traverse_node(a) for a in args]

-    if str(simple_args[0].get_literal_value()).lower().startswith("on"):
+    if unicode(simple_args[0].get_literal_value()).lower().startswith("on"):
        traverser.err.notice(
            err_id=("testcases_javascript_instanceactions", "setAttribute",
                        "setting_on*"),
--- a/validator/testcases/javascript/instanceproperties.py
+++ b/validator/testcases/javascript/instanceproperties.py
@ -1,10 +1,13 @@
 import re
 import types

+import jstypes

 def set_innerHTML(new_value, traverser):
    "Tests that values being assigned to innerHTML are not dangerous"

+    if not isinstance(new_value, jstypes.JSWrapper):
+        new_value = jstypes.JSWrapper(new_value, traverser=traverser)
    literal_value = new_value.get_literal_value()
    if isinstance(literal_value, types.StringTypes):
        # Static string assignments
@ -16,9 +19,11 @@ def set_innerHTML(new_value, traverser):
                err_id=("testcases_javascript_instancetypes", "set_innerHTML",
                            "event_assignment"),
                warning="Event handler assignment via innerHTML",
-                description="When assigning event handlers, innerHTML "
-                            "should never be used. Rather, use a "
-                            "proper technique, like addEventListener.",
+                description=["When assigning event handlers, innerHTML "
+                             "should never be used. Rather, use a "
+                             "proper technique, like addEventListener.",
+                             "Event handler code: %s" %
+                                literal_value.encode("ascii", "replace")],
                filename=traverser.filename,
                line=traverser.line,
                column=traverser.position,
--- a/validator/testcases/javascript/jstypes.py
+++ b/validator/testcases/javascript/jstypes.py
@ -19,7 +19,7 @@ class JSObject(object):

    def get(self, name):
        "Returns the value associated with a property name"
-        name = str(name)
+        name = unicode(name)
        return self.data[name] if name in self.data else None

    def get_literal_value(self):
@ -36,11 +36,11 @@ class JSObject(object):
        self.data[name] = value

    def has_var(self, name):
-        name = str(name)
+        name = unicode(name)
        return name in self.data

    def output(self):
-        return str(self.data)
+        return unicode(self.data)


 class JSContext(JSObject):
@ -56,7 +56,7 @@ class JSContext(JSObject):
    def output(self):
        output = {}
        for (name, item) in self.data.items():
-            output[name] = str(item)
+            output[name] = unicode(item)
        return json.dumps(output)


@ -273,7 +273,7 @@ class JSWrapper(object):

    def __str__(self):
        """Returns a textual version of the object."""
-        return str(self.get_literal_value())
+        return unicode(self.get_literal_value())


 class JSLiteral(JSObject):
@ -309,7 +309,7 @@ class JSPrototype(JSObject):

    def get(self, name):
        "Enables static analysis of `with` statements"
-        name = str(name)
+        name = unicode(name)
        output = None
        if name in self.data:
            output = self.data[name]
@ -351,7 +351,7 @@ class JSArray(JSObject):
        # Interestingly enough, this allows for things like:
        # x = [4]
        # y = x * 3 // y = 12 since x equals "4"
-        return ",".join([str(w.get_literal_value()) for w in self.elements])
+        return u",".join([unicode(w.get_literal_value()) for w in self.elements])

    def set(self, index, value, traverser=None):
        """Follow the rules of JS for creating an array"""
--- a/validator/testcases/javascript/predefinedentities.py
+++ b/validator/testcases/javascript/predefinedentities.py
@ -15,9 +15,9 @@ BANNED_IDENTIFIERS = ("newThread", )
 # "True", except the string will be outputted when the error is thrown.

 INTERFACES = {
-    "nsICategoryManager":
+    u"nsICategoryManager":
        {"value":
-            {"addCategoryEntry":
+            {u"addCategoryEntry":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
@ -28,33 +28,33 @@ INTERFACES = {
                         "Authors of bootstrapped add-ons must take care "
                         "to cleanup any added category entries "
                         "at shutdown")}}},
-    "nsIComponentRegistrar":
+    u"nsIComponentRegistrar":
        {"value":
-            {"autoRegister":
+            {u"autoRegister":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Bootstrapped add-ons may not register "
                        "chrome manifest files"},
-             "registerFactory":
+             u"registerFactory":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to cleanup any component registrations "
                        "at shutdown"}}},
-    "nsIObserverService":
+    u"nsIObserverService":
        {"value":
-            {"addObserver":
+            {u"addObserver":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to remove any added observers "
                        "at shutdown"}}},
-    "nsIResProtocolHandler":
+    u"nsIResProtocolHandler":
        {"value":
-            {"setSubstitution":
+            {u"setSubstitution":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
@ -64,30 +64,30 @@ INTERFACES = {
                        "Authors of bootstrapped add-ons must take care "
                        "to cleanup any added resource substitutions "
                        "at shutdown"}}},
-    "nsIStringBundleService":
+    u"nsIStringBundleService":
        {"value":
-            {"createStringBundle":
+            {u"createStringBundle":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to flush the string bundle cache at shutdown"},
-             "createExtensibleBundle":
+             u"createExtensibleBundle":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to flush the string bundle cache at shutdown"}}},
-    "nsIStyleSheetService":
+    u"nsIStyleSheetService":
        {"value":
-            {"loadAndRegisterSheet":
+            {u"loadAndRegisterSheet":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to unregister any registered stylesheets "
                        "at shutdown"}}},
-    "nsIWindowMediator":
+    u"nsIWindowMediator":
        {"value":
            {"registerNotification":
                {"dangerous":
@ -96,9 +96,9 @@ INTERFACES = {
                        "Authors of bootstrapped add-ons must take care "
                        "to remove any added observers "
                        "at shutdown"}}},
-    "nsIWindowWatcher":
+    u"nsIWindowWatcher":
        {"value":
-            {"addListener":
+            {u"addListener":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
@ -109,126 +109,126 @@ INTERFACES = {

 # GLOBAL_ENTITIES is also representative of the `window` object.
 GLOBAL_ENTITIES = {
-    "window": {"value": lambda: GLOBAL_ENTITIES},
-    "document":
-        {"value": {"createElement":
+    u"window": {"value": lambda: GLOBAL_ENTITIES},
+    u"document":
+        {"value": {u"createElement":
                       {"dangerous":
                            lambda a, t: t(a[0]).get_literal_value()
                                                .lower() == "script"},
-                   "createElementNS":
+                   u"createElementNS":
                       {"dangerous":
                            lambda a, t: t(a[0]).get_literal_value()
                                                .lower() == "script"}}},

    # The nefariuos timeout brothers!
-    "setTimeout": {"dangerous": actions._call_settimeout},
-    "setInterval": {"dangerous": actions._call_settimeout},
+    u"setTimeout": {"dangerous": actions._call_settimeout},
+    u"setInterval": {"dangerous": actions._call_settimeout},

-    "encodeURI": {"readonly": True},
-    "decodeURI": {"readonly": True},
-    "encodeURIComponent": {"readonly": True},
-    "decodeURIComponent": {"readonly": True},
-    "escape": {"readonly": True},
-    "unescape": {"readonly": True},
-    "isFinite": {"readonly": True},
-    "isNaN": {"readonly": True},
-    "parseFloat": {"readonly": True},
-    "parseInt": {"readonly": True},
+    u"encodeURI": {"readonly": True},
+    u"decodeURI": {"readonly": True},
+    u"encodeURIComponent": {"readonly": True},
+    u"decodeURIComponent": {"readonly": True},
+    u"escape": {"readonly": True},
+    u"unescape": {"readonly": True},
+    u"isFinite": {"readonly": True},
+    u"isNaN": {"readonly": True},
+    u"parseFloat": {"readonly": True},
+    u"parseInt": {"readonly": True},

-    "eval": {"dangerous": True},
-    "Function": {"dangerous": True},
-    "Object": {"value": {"prototype": {"dangerous": True},
-                         "constructor":  # Just an experiment for now
-                             {"value": lambda: GLOBAL_ENTITIES["Function"]}}},
-    "String": {"value": {"prototype": {"dangerous": True}}},
-    "Array": {"value": {"prototype": {"dangerous": True}}},
-    "Number": {"value": {"prototype": {"dangerous": True}}},
-    "Boolean": {"value": {"prototype": {"dangerous": True}}},
-    "RegExp": {"value": {"prototype": {"dangerous": True}}},
-    "Date": {"value": {"prototype": {"dangerous": True}}},
+    u"eval": {"dangerous": True},
+    u"Function": {"dangerous": True},
+    u"Object": {"value": {u"prototype": {"dangerous": True},
+                          u"constructor":  # Just an experiment for now
+                              {"value": lambda: GLOBAL_ENTITIES["Function"]}}},
+    u"String": {"value": {u"prototype": {"dangerous": True}}},
+    u"Array": {"value": {u"prototype": {"dangerous": True}}},
+    u"Number": {"value": {u"prototype": {"dangerous": True}}},
+    u"Boolean": {"value": {u"prototype": {"dangerous": True}}},
+    u"RegExp": {"value": {u"prototype": {"dangerous": True}}},
+    u"Date": {"value": {u"prototype": {"dangerous": True}}},

-    "Math": {"readonly": True},
+    u"Math": {"readonly": True},

-    "netscape":
-        {"value": {"security":
-                       {"value": {"PrivilegeManager":
-                                      {"value": {"enablePrivilege":
+    u"netscape":
+        {"value": {u"security":
+                       {"value": {u"PrivilegeManager":
+                                      {"value": {u"enablePrivilege":
                                                     {"dangerous": True}}}}}}},

-    "navigator":
-        {"value": {"wifi": {"dangerous": True},
-                   "geolocation": {"dangerous": True}}},
+    u"navigator":
+        {"value": {u"wifi": {"dangerous": True},
+                   u"geolocation": {"dangerous": True}}},

-    "Components":
+    u"Components":
        {"readonly": True,
         "value":
-             {"classes":
+             {u"classes":
                  {"xpcom_wildcard": True,
                   "value":
-                       {"createInstance":
+                       {u"createInstance":
                           {"return": call_definitions.xpcom_constructor("createInstance")},
-                        "getService":
+                        u"getService":
                           {"return": call_definitions.xpcom_constructor("getService")}}},
              "utils":
-                  {"value": {"evalInSandbox":
+                  {"value": {u"evalInSandbox":
                                 {"dangerous": True},
-                             "import":
+                             u"import":
                                 {"dangerous":
                                      lambda a, t:
                                        a and \
-                                        str(t(a[0]).get_literal_value())
-                                                   .count("ctypes.jsm")}}},
-              "interfaces":
-                  {"value": {"nsIXMLHttpRequest":
+                                        unicode(t(a[0]).get_literal_value())
+                                            .count("ctypes.jsm")}}},
+              u"interfaces":
+                  {"value": {u"nsIXMLHttpRequest":
                                {"xpcom_map":
                                     lambda:
                                        GLOBAL_ENTITIES["XMLHttpRequest"]},
-                             "nsICategoryManager":
+                             u"nsICategoryManager":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsICategoryManager"]},
-                             "nsIComponentRegistrar":
+                             u"nsIComponentRegistrar":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIComponentRegistrar"]},
-                             "nsIObserverService":
+                             u"nsIObserverService":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIObserverService"]},
-                             "nsIResProtocolHandler":
+                             u"nsIResProtocolHandler":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIResProtocolHandler"]},
-                             "nsIStyleSheetService":
+                             u"nsIStyleSheetService":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIStyleSheetService"]},
-                             "nsIStringBundleService":
+                             u"nsIStringBundleService":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIStringBundleService"]},
-                             "nsIWindowMediator":
+                             u"nsIWindowMediator":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIWindowMediator"]},
-                             "nsIWindowWatcher":
+                             u"nsIWindowWatcher":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIWindowWatcher"]},
-                             "nsIProcess":
+                             u"nsIProcess":
                                {"dangerous": True},
-                             "nsIDOMGeoGeolocation":
+                             u"nsIDOMGeoGeolocation":
                                {"dangerous": True},
-                             "nsIX509CertDB":
+                             u"nsIX509CertDB":
                                {"dangerous": True},
-                             "mozIJSSubScriptLoader":
+                             u"mozIJSSubScriptLoader":
                                {"dangerous": True}}}}},
-    "extensions": {"dangerous": True},
-    "xpcnativewrappers": {"dangerous": True},
+    u"extensions": {"dangerous": True},
+    u"xpcnativewrappers": {"dangerous": True},

-    "XMLHttpRequest":
+    u"XMLHttpRequest":
        {"value":
-             {"open": {"dangerous":
+             {u"open": {"dangerous":
                           # Ban syncrhonous XHR by making sure the third arg
                           # is absent and false.
                           lambda a, t:
@ -241,7 +241,7 @@ GLOBAL_ENTITIES = {
                               "connections."}}},

    # Global properties are inherently read-only, though this formalizes it.
-    "Infinity": {"readonly": True},
-    "NaN": {"readonly": True},
-    "undefined": {"readonly": True},
+    u"Infinity": {"readonly": True},
+    u"NaN": {"readonly": True},
+    u"undefined": {"readonly": True},
    }
--- a/validator/testcases/javascript/spidermonkey.py
+++ b/validator/testcases/javascript/spidermonkey.py
@ -1,3 +1,4 @@
+import codecs
 import json
 import os
 import re
@ -7,7 +8,7 @@ from cStringIO import StringIO

 from validator.constants import SPIDERMONKEY_INSTALLATION
 from validator.contextgenerator import ContextGenerator
-from validator.textfilter import *
+import validator.unicodehelper as unicodehelper

 JS_ESCAPE = re.compile("\\\\+[ux]", re.I)

@ -81,59 +82,21 @@ def prepare_code(code, err, filename):
    # slash: a character is necessary to prevent bad identifier errors
    code = JS_ESCAPE.sub("u", code)

-    encoding = None
-    try:
-        code = unicode(code)  # Make sure we can get a Unicode representation
-        code = strip_weird_chars(code, err=err, name=filename)
-    except UnicodeDecodeError:
-        # If it's not an easily decodeable encoding, detect it and decode that
-        code = filter_ascii(code)
-
+    code = unicodehelper.decode(code)
    return code


-def strip_weird_chars(chardata, err=None, name=""):
-    line_num = 1
-    out_code = StringIO()
-    has_warned_ctrlchar = False
-
-    for line in chardata.split("\n"):
-
-        charpos = 0
-        for char in line:
-            if is_standard_ascii(char):
-                out_code.write(char)
-            else:
-                if not has_warned_ctrlchar and err is not None:
-                    err.warning(("testcases_scripting",
-                                 "_get_tree",
-                                 "control_char_filter"),
-                                "Invalid control character in JS file",
-                                "An invalid character (ASCII 0-31, except CR "
-                                "and LF) has been found in a JS file. These "
-                                "are considered unsafe and should be removed.",
-                                filename=name,
-                                line=line_num,
-                                column=charpos,
-                                context=ContextGenerator(chardata))
-                has_warned_ctrlchar = True
-
-            charpos += 1
-
-        out_code.write("\n")
-        line_num += 1
-
-    return out_code.getvalue()
-
-
 def _get_tree(code, shell=SPIDERMONKEY_INSTALLATION):
    "Returns an AST tree of the JS passed in `code`."

    if not code:
        return None

-    temp = tempfile.NamedTemporaryFile(mode="w+", delete=False)
-    temp.write(code)
+    code = unicodehelper.decode(code)
+
+    temp = tempfile.NamedTemporaryFile(mode="w+b", delete=False)
+    #temp.write(codecs.BOM_UTF8)
+    temp.write(code.encode("utf_8"))
    temp.flush()

    data = """try{
@ -147,7 +110,7 @@ def _get_tree(code, shell=SPIDERMONKEY_INSTALLATION):
    }""" % json.dumps(temp.name)

    try:
-        cmd = [shell, "-e", data]
+        cmd = [shell, "-e", data, "-U"]
        try:
            shell_obj = subprocess.Popen(cmd,
                                   shell=False,
@ -171,11 +134,7 @@ def _get_tree(code, shell=SPIDERMONKEY_INSTALLATION):
    if not data:
        raise JSReflectException("Reflection failed")

-    try:
-        data = unicode(data)
-    except UnicodeDecodeError:
-        data = unicode(filter_ascii(data))
-
+    data = unicodehelper.decode(data)
    parsed = json.loads(data, strict=False)

    if "error" in parsed and parsed["error"]:
--- a/validator/testcases/javascript/traverser.py
+++ b/validator/testcases/javascript/traverser.py
@ -40,12 +40,14 @@ class MockBundler:
        # Increment the message counter
        self.message_count += 1

-        self.ids.append(id)
+        self.ids.append(err_id)
+
+        error = unicode(error)

        print "-" * 30
-        print error
+        print error.encode("ascii", "replace")
        print "~" * len(error)
-        if isinstance(description, str):
+        if isinstance(description, types.StringTypes):
            print description
        else:
            # Errors can have multiple lines
@ -105,12 +107,14 @@ class Traverser:
            output = data
            if isinstance(data, JSObject) or isinstance(data, JSContext):
                output = data.output()
-            print ". " * self.debug_level + output
+
+            output = unicode(output)
+            print ". " * self.debug_level + output.encode("ascii", "replace")

    def run(self, data):
        if DEBUG:
            x = open("/tmp/output.js", "w")
-            x.write(str(data))
+            x.write(unicode(data))
            x.close()

        if "type" not in data or not self._can_handle_node(data["type"]):
@ -189,7 +193,7 @@ class Traverser:
        if action is not None:
            action_result = action(self, node)
            self._debug("ACTION>>%s (%s)" %
-                    ("halt>>%s" % str(action_result) if
+                    ("halt>>%s" % unicode(action_result) if
                        action_result else
                        "continue",
                     node["type"]))
@ -350,7 +354,8 @@ class Traverser:
                                  "_build_global",
                                  "dangerous_global"),
                                 "Dangerous Global Object",
-                                 [dang if isinstance(dang, str) else
+                                 [dang if
+                                  isinstance(dang, types.StringTypes) else
                                  "A dangerous or banned global object was "
                                  "accessed by some JavaScript code.",
                                  "Accessed object: %s" % name],
--- a/validator/testcases/markup/markuptester.py
+++ b/validator/testcases/markup/markuptester.py
@ -1,4 +1,3 @@
-
 import re
 try:
    from HTMLParser import HTMLParser
@ -6,6 +5,7 @@ except ImportError:  # pragma: no cover
    from html.parser import HTMLParser

 import validator.testcases.scripting as scripting
+import validator.unicodehelper as unicodehelper
 from validator.testcases.markup import csstester
 from validator.contextgenerator import ContextGenerator
 from validator.constants import *
@ -51,7 +51,7 @@ class MarkupParser(HTMLParser):
        self.xml_state = []
        self.xml_buffer = []

-        self.reported = {}
+        self.reported = set()

    def process(self, filename, data, extension="xul"):
        """Processes data by splitting it into individual lines, then
@ -61,7 +61,7 @@ class MarkupParser(HTMLParser):
        self.filename = filename
        self.extension = extension

-        self.reported = {}
+        self.reported = set()

        self.context = ContextGenerator(data)

@ -100,6 +100,8 @@ class MarkupParser(HTMLParser):

        try:
            self.feed(line + "\n")
+        except UnicodeDecodeError:
+            raise
        except Exception as inst:
            if DEBUG:  # pragma: no cover
                print self.xml_state, inst
@ -107,8 +109,8 @@ class MarkupParser(HTMLParser):
            if "markup" in self.reported:
                return

-            if "script" in self.xml_state or (
-               self.debug and "testscript" in self.xml_state):
+            if ("script" in self.xml_state or
+                self.debug and "testscript" in self.xml_state):
                if "script_comments" in self.reported or not self.strict:
                    return
                self.err.notice(("testcases_markup_markuptester",
@ -122,7 +124,7 @@ class MarkupParser(HTMLParser):
                                self.filename,
                                line=self.line,
                                context=self.context)
-                self.reported["script_comments"] = True
+                self.reported.add("script_comments")
                return

            if self.strict:
@ -136,7 +138,7 @@ class MarkupParser(HTMLParser):
                                 self.filename,
                                 line=self.line,
                                 context=self.context)
-            self.reported["markup"] = True
+            self.reported.add("markup")

    def handle_startendtag(self, tag, attrs):
        # Self closing tags don't have an end tag, so we want to
@ -154,7 +156,7 @@ class MarkupParser(HTMLParser):
            self_closing = tag in SELF_CLOSING_TAGS

        if DEBUG:  # pragma: no cover
-            print self.xml_state, tag, self_closing
+            print "S: ", self.xml_state, tag, self_closing

        # A fictional tag for testing purposes.
        if tag == "xbannedxtestx":
@ -286,17 +288,19 @@ class MarkupParser(HTMLParser):
            return

        self.xml_state.append(tag)
-        self.xml_buffer.append("")
+        self.xml_buffer.append(unicode(""))

    def handle_endtag(self, tag):

        tag = tag.lower()

        if DEBUG:  # pragma: no cover
-            print tag, self.xml_state
+            print "E: ", tag, self.xml_state

        if not self.xml_state:
            if "closing_tags" in self.reported or not self.strict:
+                if DEBUG:
+                    print "Unstrict; extra closing tags ------"
                return
            self.err.warning(("testcases_markup_markuptester",
                              "handle_endtag",
@ -307,16 +311,18 @@ class MarkupParser(HTMLParser):
                             self.filename,
                             line=self.line,
                             context=self.context)
-            self.reported["closing_tags"] = True
+            self.reported.add("closing_tags")
            if DEBUG:  # pragma: no cover
                print "Too many closing tags ------"
            return

-        elif "script" in self.xml_state:
+        elif "script" in self.xml_state[:-1]:
            # If we're in a script tag, nothing else matters. Just rush
            # everything possible into the xml buffer.

            self._save_to_buffer("</" + tag + ">")
+            if DEBUG:
+                print "Markup as text in script ------"
            return

        elif tag not in self.xml_state:
@ -344,6 +350,8 @@ class MarkupParser(HTMLParser):
        # classifies as a self-closing tag, we just recursively close
        # down to the level of the tag we're actualy closing.
        if old_state != tag and old_state in SELF_CLOSING_TAGS:
+            if DEBUG:
+                print "Self closing tag cascading down ------"
            return self.handle_endtag(tag)

        # If this is an XML-derived language, everything must nest
@ -365,17 +373,20 @@ class MarkupParser(HTMLParser):
            if DEBUG:  # pragma: no cover
                print "Invalid markup nesting ------"

+        data_buffer = data_buffer.strip()
+
        # Perform analysis on collected data.
-        if tag == "script":
-            scripting.test_js_snippet(self.err,
-                                      data_buffer,
-                                      self.filename,
-                                      self.line)
-        elif tag == "style":
-            csstester.test_css_file(self.err,
-                                    self.filename,
-                                    data_buffer,
-                                    self.line)
+        if data_buffer:
+            if tag == "script":
+                scripting.test_js_snippet(self.err,
+                                          data_buffer,
+                                          self.filename,
+                                          self.line)
+            elif tag == "style":
+                csstester.test_css_file(self.err,
+                                        self.filename,
+                                        data_buffer,
+                                        self.line)

    def handle_data(self, data):
        self._save_to_buffer(data)
@ -413,6 +424,8 @@ class MarkupParser(HTMLParser):
        if not self.xml_buffer:
            return

+        data = unicodehelper.decode(data)
+
        self.xml_buffer[-1] += data

    def _format_args(self, args):
--- a/validator/unicodehelper.py
+++ b/validator/unicodehelper.py
@ -0,0 +1,55 @@
+import codecs
+import textfilter
+
+# Many thanks to nmaier for inspiration and code in this module
+
+UNICODES = [  # (BOM, codec name) pairs; decode() tests them in this order
+    (codecs.BOM_UTF8, "utf-8"),
+    (codecs.BOM_UTF32_LE, "utf-32-le"),  # before UTF-16-LE: shares its prefix
+    (codecs.BOM_UTF32_BE, "utf-32-be"),
+    (codecs.BOM_UTF16_LE, "utf-16-le"),
+    (codecs.BOM_UTF16_BE, "utf-16-be"),
+    ]
+
+COMMON_ENCODINGS = ("utf-16", "latin_1", "ascii")  # fallbacks, tried in order
+
+def decode(data):
+    """
+    Decode the byte string `data` to unicode via BOM and charset detection,
+    stripping any BOM. Input that is not a `str` is returned unchanged.
+    """
+
+    # Don't make more work than we have to.
+    if not isinstance(data, str):
+        return data
+
+    # A leading BOM names the encoding; undecodable bytes are dropped.
+    for bom, encoding in UNICODES:
+        if data.startswith(bom):
+            return unicode(data[len(bom):], encoding, "ignore")
+
+    # Try straight UTF-8
+    try:
+        return unicode(data, "utf-8")
+    except UnicodeDecodeError:
+        pass
+
+    # Test for latin_1, because it can be matched as UTF-16. A hack, but far
+    # faster than chardet. NOTE(review): every char of a byte `str` has
+    # ord() < 256, so this always matches and the code below looks dead.
+    if all(ord(c) < 256 for c in data):
+        try:
+            return unicode(data, "latin_1")
+        except UnicodeDecodeError:
+            pass
+
+    # Test for various common encodings.
+    for encoding in COMMON_ENCODINGS:
+        try:
+            return unicode(data, encoding)
+        except UnicodeDecodeError:
+            pass
+
+    # Anything else gets filtered.
+    return unicode(textfilter.filter_ascii(data), errors="replace")
+