Merged encoding fixes into codebase

2011-04-21 17:11:17 +00:00 · 2011-04-21 17:11:17 +00:00 · 6e2e1fd6b4
--- a/tests/resources/bug_621360.js
+++ b/tests/resources/bug_621360.js
--- a/tests/resources/controlchars.js
+++ b/tests/resources/controlchars.js
--- a/tests/resources/controlchars/controlchars_ascii_ok.js
+++ b/tests/resources/controlchars/controlchars_ascii_ok.js
@ -0,0 +1 @@
 function test() {}
--- a/tests/resources/controlchars/controlchars_ascii_warn.js
+++ b/tests/resources/controlchars/controlchars_ascii_warn.js
--- a/tests/resources/controlchars/controlchars_bad.js
+++ b/tests/resources/controlchars/controlchars_bad.js
--- a/tests/resources/controlchars/controlchars_utf-8_ok.js
+++ b/tests/resources/controlchars/controlchars_utf-8_ok.js
@ -0,0 +1 @@
 function täst() {}
--- a/tests/resources/controlchars/controlchars_utf-8_warn.js
+++ b/tests/resources/controlchars/controlchars_utf-8_warn.js
--- a/tests/resources/unicodehelper/latin_1.txt
+++ b/tests/resources/unicodehelper/latin_1.txt
@ -0,0 +1 @@
 täst
--- a/tests/resources/unicodehelper/utf-16be.txt
+++ b/tests/resources/unicodehelper/utf-16be.txt
--- a/tests/resources/unicodehelper/utf-16le.txt
+++ b/tests/resources/unicodehelper/utf-16le.txt
--- a/tests/resources/unicodehelper/utf-32be.txt
+++ b/tests/resources/unicodehelper/utf-32be.txt
--- a/tests/resources/unicodehelper/utf-32le.txt
+++ b/tests/resources/unicodehelper/utf-32le.txt
--- a/tests/resources/unicodehelper/utf-8-bom.txt
+++ b/tests/resources/unicodehelper/utf-8-bom.txt
@ -0,0 +1 @@
 täst
--- a/tests/resources/unicodehelper/utf-8.txt
+++ b/tests/resources/unicodehelper/utf-8.txt
@ -0,0 +1 @@
 täst
--- a/tests/test_bug_621360.py
+++ b/tests/test_bug_621360.py
@ -1,20 +0,0 @@
 import os
 import validator.testcases.scripting
 def _do_test(path):
    "Performs a test on a JS file"
    script = open(path).read()
    err = validator.testcases.scripting.traverser.MockBundler()
    validator.testcases.scripting.test_js_file(err, path, script)
    return err
 def test_control_chars():
    "Tests that control characters throw a single error"
    err = _do_test("tests/resources/bug_621360.js")
    # There should be a single error.
    print err.message_count
    assert err.message_count == 1
--- a/tests/test_controlchars.py
+++ b/tests/test_controlchars.py
@ -0,0 +1,47 @@
 import os
 import validator.unicodehelper
 import validator.testcases.scripting
 # Originated from bug 626496
 def _do_test(path):
    "Performs a test on a JS file"
    script = validator.unicodehelper.decode(open(path, "rb").read())
    print script.encode("ascii", "replace")
    err = validator.testcases.scripting.traverser.MockBundler()
    validator.testcases.scripting.test_js_file(err, path, script)
    print err.ids
    return err
 def test_controlchars_ascii_ok():
    """Tests that controlchars_ascii_ok.js is processed without any entry
    being recorded in the bundle's `ids` list."""
    errs = _do_test("tests/resources/controlchars/controlchars_ascii_ok.js")
    assert len(errs.ids) == 0
 def test_controlchars_ascii_warn():
    """Tests that controlchars_ascii_warn.js produces exactly one message
    and that the third element of its id is "syntax_error"."""
    errs = _do_test("tests/resources/controlchars/controlchars_ascii_warn.js")
    assert len(errs.ids) == 1
    assert errs.ids[0][2] == "syntax_error"
 def test_controlchars_utf8_ok():
    """Tests that controlchars_utf-8_ok.js is decoded and processed without
    any entry being recorded in the bundle's `ids` list."""
    errs = _do_test("tests/resources/controlchars/controlchars_utf-8_ok.js")
    assert len(errs.ids) == 0
 def test_controlchars_utf8_warn():
    """Tests that controlchars_utf-8_warn.js produces exactly one message
    and that the third element of its id is "syntax_error"."""
    errs = _do_test("tests/resources/controlchars/controlchars_utf-8_warn.js")
    assert len(errs.ids) == 1
    assert errs.ids[0][2] == "syntax_error"
--- a/tests/test_markup_markuptester.py
+++ b/tests/test_markup_markuptester.py
@ -1,24 +1,26 @@
 # -*- coding: utf-8 -*-
 import validator.testcases.markup.markuptester as markuptester
 from validator.errorbundler import ErrorBundle
 from validator.constants import *
 def _do_test(path, should_fail=False, type_=None):
    """Read the markup file at `path` and run it through `_do_test_raw`.

    `should_fail` and `type_` are passed through unchanged, and whatever
    `_do_test_raw` returns is returned to the caller.
    """
    # Close the file deterministically instead of relying on garbage
    # collection of an anonymous file object.
    with open(path) as markup_source:
        contents = markup_source.read()
    return _do_test_raw(contents,
                        path,
                        should_fail,
                        type_)
-    markup_file = open(path)
+def _do_test_raw(data, path, should_fail=False, type_=None):
    data = markup_file.read()
    markup_file.close()
    filename = path.split("/")[-1]
    extension = filename.split(".")[-1]
-    err = ErrorBundle(None, True)
+    err = ErrorBundle()
    if type_:
        err.set_type(type_)
    parser = markuptester.MarkupParser(err, debug=True)
    parser.process(filename, data, extension)
-    err.print_summary(True)
+    print err.print_summary(verbose=True)
    if should_fail:
        assert err.failed()
@ -31,7 +33,7 @@ def _do_test(path, should_fail=False, type_=None):
 def test_local_url_detector():
    "Tests that local URLs can be detected."
-    err = ErrorBundle(None, True)
+    err = ErrorBundle()
    mp = markuptester.MarkupParser(err)
    tester = mp._is_url_local
@ -135,3 +137,18 @@ def test_invalid_markup():
    result = _do_test("tests/resources/markup/markuptester/bad_script.xml",
                      False)
    assert result.notices
 def test_self_closing_scripts():
    """Tests that self-closing script tags are not deleterious to parsing"""
    _do_test_raw("""
    <foo>
        <script type="text/javascript"/>
        <list_item undecodable=" _ " />
        <list_item />
        <list_item />
    </foo>
    """, "foo.js")
--- a/tests/test_unicodehelper.py
+++ b/tests/test_unicodehelper.py
@ -0,0 +1,43 @@
 # -*- coding: utf-8 -*-
 import nose
 import os
 import validator.unicodehelper as unicodehelper
 COMPARISON = "täst".decode("utf-8")
 def _do_test(path):
    "Performs a test on a JS file"
    text = open(path).read()
    utext = unicodehelper.decode(text)
    print utext.encode("ascii", "backslashreplace")
    nose.tools.eq_(utext, COMPARISON)
 def test_latin1():
    "Tests latin-1 encoding is properly decoded"
    _do_test("tests/resources/unicodehelper/latin_1.txt")
 def test_utf8():
    "Tests utf-8 w/o BOM encoding is properly decoded"
    _do_test("tests/resources/unicodehelper/utf-8.txt")
 # The BOM variant needs its own name: two functions both called test_utf8
 # would leave only the later definition in the module, so the test above
 # would never be collected or run.
 def test_utf8_bom():
    "Tests utf-8 with BOM encoding is properly decoded"
    _do_test("tests/resources/unicodehelper/utf-8-bom.txt")
 def test_utf16le():
    "Tests that the utf-16 Little Endian fixture decodes to COMPARISON"
    _do_test("tests/resources/unicodehelper/utf-16le.txt")
 def test_utf16be():
    "Tests that the utf-16 Big Endian fixture decodes to COMPARISON"
    _do_test("tests/resources/unicodehelper/utf-16be.txt")
 def test_utf32le():
    "Tests that the utf-32 Little Endian fixture decodes to COMPARISON"
    _do_test("tests/resources/unicodehelper/utf-32le.txt")
 def test_utf32be():
    "Tests that the utf-32 Big Endian fixture decodes to COMPARISON"
    _do_test("tests/resources/unicodehelper/utf-32be.txt")
--- a/validator/contextgenerator.py
+++ b/validator/contextgenerator.py
@ -1,6 +1,5 @@
 from StringIO import StringIO
-
+import unicodehelper
 import textfilter
 class ContextGenerator:
@ -83,6 +82,6 @@ class ContextGenerator:
                data = "%s ..." % data[:140]
        data = "%s%s" % (raw_data[0:with_ws - line_length], data)
-        data = textfilter.filter_ascii(data)
+        data = unicodehelper.decode(data)
        return data
--- a/validator/errorbundler.py
+++ b/validator/errorbundler.py
@ -3,7 +3,7 @@ import uuid
 from StringIO import StringIO
 from outputhandlers.shellcolors import OutputHandler
-from textfilter import filter_ascii
+import unicodehelper
 class ErrorBundle(object):
@ -110,8 +110,8 @@ class ErrorBundle(object):
        else:
            message["context"] = None
-        message["message"] = filter_ascii(message["message"])
+        message["message"] = unicodehelper.decode(message["message"])
-        message["description"] = filter_ascii(message["description"])
+        message["description"] = unicodehelper.decode(message["description"])
        stack.append(message)
--- a/validator/main.py
+++ b/validator/main.py
@ -85,7 +85,7 @@ def main():
    # Print the output of the tests based on the requested format.
    if args.output == "text":
        print error_bundle.print_summary(verbose=args.verbose,
-                                         no_color=args.boring)
+                                         no_color=args.boring).encode("utf-8")
    elif args.output == "json":
        sys.stdout.write(error_bundle.render_json())
--- a/validator/testcases/content.py
+++ b/validator/testcases/content.py
@ -6,6 +6,7 @@ from StringIO import StringIO
 from validator.contextgenerator import ContextGenerator
 from validator import decorator
 from validator import submain as testendpoint_validator
 from validator import unicodehelper
 import validator.testcases.markup.markuptester as testendpoint_markup
 import validator.testcases.markup.csstester as testendpoint_css
 import validator.testcases.scripting as testendpoint_js
@ -165,9 +166,8 @@ def test_packed_packages(err, package_contents=None, xpi_package=None):
            if not file_data:
                continue
-            # Skip BOMs and the like
+            # Convert the file data to unicode
-            while not is_standard_ascii(file_data[0]):
+            file_data = unicodehelper.decode(file_data)
                file_data = file_data[1:]
            if data["extension"] == "css":
                testendpoint_css.test_css_file(err,
--- a/validator/testcases/javascript/actions.py
+++ b/validator/testcases/javascript/actions.py
@ -291,12 +291,14 @@ def _call_expression(traverser, node):
            result = dangerous(a=args, t=t)
        if result:
            # Generate a string representation of the params
-            params = ", ".join([str(t(p).get_literal_value()) for p in args])
+            params = u", ".join([unicode(t(p).get_literal_value()) for
                                 p in args])
            traverser.err.warning(("testcases_javascript_actions",
                                   "_call_expression",
                                   "called_dangerous_global"),
                                  "Global called in dangerous manner",
-                                  result if isinstance(result, str) else
+                                  result if isinstance(result,
                                                       types.StringTypes) else
                                  "A global function was called using a set "
                                  "of dangerous parameters. These parameters "
                                  "have been disallowed.",
@ -418,10 +420,10 @@ def _expr_assignment(traverser, node):
        if lit_right is None:
            lit_right = 0
-        if isinstance(lit_left, (str, unicode)) or \
+        if isinstance(lit_left, types.StringTypes) or \
-           isinstance(lit_right, (str, unicode)):
+           isinstance(lit_right, types.StringTypes):
-            lit_left = str(lit_left)
+            lit_left = unicode(lit_left)
-            lit_right = str(lit_right)
+            lit_right = unicode(lit_right)
        gleft = _get_as_num(left)
        gright = _get_as_num(right)
@ -446,8 +448,8 @@ def _expr_assignment(traverser, node):
            traverser.debug_level -= 1
            return left
-        traverser._debug("ASSIGNMENT::LEFT>>%s" % str(left.is_global))
+        traverser._debug("ASSIGNMENT::LEFT>>%s" % unicode(left.is_global))
-        traverser._debug("ASSIGNMENT::RIGHT>>%s" % str(operators[token]()))
+        traverser._debug("ASSIGNMENT::RIGHT>>%s" % unicode(operators[token]()))
        left.set_value(operators[token](), traverser=traverser)
        traverser.debug_level -= 1
        return left
@ -469,7 +471,7 @@ def _expr_binary(traverser, node):
    left = traverser._traverse_node(node["left"])
    if not isinstance(left, JSWrapper):
        left = JSWrapper(left, traverser=traverser)
-    traverser._debug(str(left.dirty))
+    traverser._debug(unicode(left.dirty))
    traverser.debug_level -= 1
@ -479,7 +481,7 @@ def _expr_binary(traverser, node):
    right = traverser._traverse_node(node["right"])
    if not isinstance(right, JSWrapper):
        right = JSWrapper(right, traverser=traverser)
-    traverser._debug(str(right.dirty))
+    traverser._debug(unicode(right.dirty))
    if left.dirty:
        return left
@ -589,7 +591,7 @@ def _get_as_num(value):
        return False
    try:
-        if isinstance(value, str):
+        if isinstance(value, types.StringTypes):
            return float(value)
        elif isinstance(value, int) or isinstance(value, float):
            return value
--- a/validator/testcases/javascript/instanceactions.py
+++ b/validator/testcases/javascript/instanceactions.py
@ -9,7 +9,7 @@ traverser
 node
    the current node being evaluated
 """
-
+import types
 from jstypes import *
@ -21,10 +21,11 @@ def createElement(args, traverser, node):
    simple_args = [traverser._traverse_node(a) for a in args]
-    if str(simple_args[0].get_literal_value()).lower() == "script":
+    if unicode(simple_args[0].get_literal_value()).lower() == u"script":
        _create_script_tag(traverser)
    elif not (simple_args[0].is_literal() or
-              isinstance(simple_args[0].get_literal_value(), str)):
+              isinstance(simple_args[0].get_literal_value(),
                         types.StringTypes)):
        _create_variable_element(traverser)
@ -36,10 +37,11 @@ def createElementNS(args, traverser, node):
    simple_args = [traverser._traverse_node(a) for a in args]
-    if "script" in str(simple_args[1].get_literal_value()).lower():
+    if "script" in unicode(simple_args[1].get_literal_value()).lower():
        _create_script_tag(traverser)
    elif not (simple_args[1].is_literal() or
-              isinstance(simple_args[1].get_literal_value(), str)):
+              isinstance(simple_args[1].get_literal_value(),
                         types.StringTypes)):
        _create_variable_element(traverser)
@ -115,7 +117,7 @@ def setAttribute(args, traverser, node):
    simple_args = [traverser._traverse_node(a) for a in args]
-    if str(simple_args[0].get_literal_value()).lower().startswith("on"):
+    if unicode(simple_args[0].get_literal_value()).lower().startswith("on"):
        traverser.err.notice(
            err_id=("testcases_javascript_instanceactions", "setAttribute",
                        "setting_on*"),
--- a/validator/testcases/javascript/instanceproperties.py
+++ b/validator/testcases/javascript/instanceproperties.py
@ -1,10 +1,13 @@
 import re
 import types
 import jstypes
 def set_innerHTML(new_value, traverser):
    "Tests that values being assigned to innerHTML are not dangerous"
    if not isinstance(new_value, jstypes.JSWrapper):
        new_value = jstypes.JSWrapper(new_value, traverser=traverser)
    literal_value = new_value.get_literal_value()
    if isinstance(literal_value, types.StringTypes):
        # Static string assignments
@ -16,9 +19,11 @@ def set_innerHTML(new_value, traverser):
                err_id=("testcases_javascript_instancetypes", "set_innerHTML",
                            "event_assignment"),
                warning="Event handler assignment via innerHTML",
-                description="When assigning event handlers, innerHTML "
+                description=["When assigning event handlers, innerHTML "
                             "should never be used. Rather, use a "
                             "proper technique, like addEventListener.",
                             "Event handler code: %s" %
                                literal_value.encode("ascii", "replace")],
                filename=traverser.filename,
                line=traverser.line,
                column=traverser.position,
--- a/validator/testcases/javascript/jstypes.py
+++ b/validator/testcases/javascript/jstypes.py
@ -19,7 +19,7 @@ class JSObject(object):
    def get(self, name):
        "Returns the value associated with a property name"
-        name = str(name)
+        name = unicode(name)
        return self.data[name] if name in self.data else None
    def get_literal_value(self):
@ -36,11 +36,11 @@ class JSObject(object):
        self.data[name] = value
    def has_var(self, name):
-        name = str(name)
+        name = unicode(name)
        return name in self.data
    def output(self):
-        return str(self.data)
+        return unicode(self.data)
 class JSContext(JSObject):
@ -56,7 +56,7 @@ class JSContext(JSObject):
    def output(self):
        output = {}
        for (name, item) in self.data.items():
-            output[name] = str(item)
+            output[name] = unicode(item)
        return json.dumps(output)
@ -273,7 +273,7 @@ class JSWrapper(object):
    def __str__(self):
        """Returns a textual version of the object."""
-        return str(self.get_literal_value())
+        return unicode(self.get_literal_value())
 class JSLiteral(JSObject):
@ -309,7 +309,7 @@ class JSPrototype(JSObject):
    def get(self, name):
        "Enables static analysis of `with` statements"
-        name = str(name)
+        name = unicode(name)
        output = None
        if name in self.data:
            output = self.data[name]
@ -351,7 +351,7 @@ class JSArray(JSObject):
        # Interestingly enough, this allows for things like:
        # x = [4]
        # y = x * 3 // y = 12 since x equals "4"
-        return ",".join([str(w.get_literal_value()) for w in self.elements])
+        return u",".join([unicode(w.get_literal_value()) for w in self.elements])
    def set(self, index, value, traverser=None):
        """Follow the rules of JS for creating an array"""
--- a/validator/testcases/javascript/predefinedentities.py
+++ b/validator/testcases/javascript/predefinedentities.py
@ -15,9 +15,9 @@ BANNED_IDENTIFIERS = ("newThread", )
 # "True", except the string will be outputted when the error is thrown.
 INTERFACES = {
-    "nsICategoryManager":
+    u"nsICategoryManager":
        {"value":
-            {"addCategoryEntry":
+            {u"addCategoryEntry":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
@ -28,33 +28,33 @@ INTERFACES = {
                         "Authors of bootstrapped add-ons must take care "
                         "to cleanup any added category entries "
                         "at shutdown")}}},
-    "nsIComponentRegistrar":
+    u"nsIComponentRegistrar":
        {"value":
-            {"autoRegister":
+            {u"autoRegister":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Bootstrapped add-ons may not register "
                        "chrome manifest files"},
-             "registerFactory":
+             u"registerFactory":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to cleanup any component registrations "
                        "at shutdown"}}},
-    "nsIObserverService":
+    u"nsIObserverService":
        {"value":
-            {"addObserver":
+            {u"addObserver":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to remove any added observers "
                        "at shutdown"}}},
-    "nsIResProtocolHandler":
+    u"nsIResProtocolHandler":
        {"value":
-            {"setSubstitution":
+            {u"setSubstitution":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
@ -64,30 +64,30 @@ INTERFACES = {
                        "Authors of bootstrapped add-ons must take care "
                        "to cleanup any added resource substitutions "
                        "at shutdown"}}},
-    "nsIStringBundleService":
+    u"nsIStringBundleService":
        {"value":
-            {"createStringBundle":
+            {u"createStringBundle":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to flush the string bundle cache at shutdown"},
-             "createExtensibleBundle":
+             u"createExtensibleBundle":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to flush the string bundle cache at shutdown"}}},
-    "nsIStyleSheetService":
+    u"nsIStyleSheetService":
        {"value":
-            {"loadAndRegisterSheet":
+            {u"loadAndRegisterSheet":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
                        "Authors of bootstrapped add-ons must take care "
                        "to unregister any registered stylesheets "
                        "at shutdown"}}},
-    "nsIWindowMediator":
+    u"nsIWindowMediator":
        {"value":
            {"registerNotification":
                {"dangerous":
@ -96,9 +96,9 @@ INTERFACES = {
                        "Authors of bootstrapped add-ons must take care "
                        "to remove any added observers "
                        "at shutdown"}}},
-    "nsIWindowWatcher":
+    u"nsIWindowWatcher":
        {"value":
-            {"addListener":
+            {u"addListener":
                {"dangerous":
                    lambda a, t, e:
                        e.get_resource("em:bootstrap") and \
@ -109,126 +109,126 @@ INTERFACES = {
 # GLOBAL_ENTITIES is also representative of the `window` object.
 GLOBAL_ENTITIES = {
-    "window": {"value": lambda: GLOBAL_ENTITIES},
+    u"window": {"value": lambda: GLOBAL_ENTITIES},
-    "document":
+    u"document":
-        {"value": {"createElement":
+        {"value": {u"createElement":
                       {"dangerous":
                            lambda a, t: t(a[0]).get_literal_value()
                                                .lower() == "script"},
-                   "createElementNS":
+                   u"createElementNS":
                       {"dangerous":
                            lambda a, t: t(a[0]).get_literal_value()
                                                .lower() == "script"}}},
    # The nefarious timeout brothers!
-    "setTimeout": {"dangerous": actions._call_settimeout},
+    u"setTimeout": {"dangerous": actions._call_settimeout},
-    "setInterval": {"dangerous": actions._call_settimeout},
+    u"setInterval": {"dangerous": actions._call_settimeout},
-    "encodeURI": {"readonly": True},
+    u"encodeURI": {"readonly": True},
-    "decodeURI": {"readonly": True},
+    u"decodeURI": {"readonly": True},
-    "encodeURIComponent": {"readonly": True},
+    u"encodeURIComponent": {"readonly": True},
-    "decodeURIComponent": {"readonly": True},
+    u"decodeURIComponent": {"readonly": True},
-    "escape": {"readonly": True},
+    u"escape": {"readonly": True},
-    "unescape": {"readonly": True},
+    u"unescape": {"readonly": True},
-    "isFinite": {"readonly": True},
+    u"isFinite": {"readonly": True},
-    "isNaN": {"readonly": True},
+    u"isNaN": {"readonly": True},
-    "parseFloat": {"readonly": True},
+    u"parseFloat": {"readonly": True},
-    "parseInt": {"readonly": True},
+    u"parseInt": {"readonly": True},
-    "eval": {"dangerous": True},
+    u"eval": {"dangerous": True},
-    "Function": {"dangerous": True},
+    u"Function": {"dangerous": True},
-    "Object": {"value": {"prototype": {"dangerous": True},
+    u"Object": {"value": {u"prototype": {"dangerous": True},
-                         "constructor":  # Just an experiment for now
+                          u"constructor":  # Just an experiment for now
                              {"value": lambda: GLOBAL_ENTITIES["Function"]}}},
-    "String": {"value": {"prototype": {"dangerous": True}}},
+    u"String": {"value": {u"prototype": {"dangerous": True}}},
-    "Array": {"value": {"prototype": {"dangerous": True}}},
+    u"Array": {"value": {u"prototype": {"dangerous": True}}},
-    "Number": {"value": {"prototype": {"dangerous": True}}},
+    u"Number": {"value": {u"prototype": {"dangerous": True}}},
-    "Boolean": {"value": {"prototype": {"dangerous": True}}},
+    u"Boolean": {"value": {u"prototype": {"dangerous": True}}},
-    "RegExp": {"value": {"prototype": {"dangerous": True}}},
+    u"RegExp": {"value": {u"prototype": {"dangerous": True}}},
-    "Date": {"value": {"prototype": {"dangerous": True}}},
+    u"Date": {"value": {u"prototype": {"dangerous": True}}},
-    "Math": {"readonly": True},
+    u"Math": {"readonly": True},
-    "netscape":
+    u"netscape":
-        {"value": {"security":
+        {"value": {u"security":
-                       {"value": {"PrivilegeManager":
+                       {"value": {u"PrivilegeManager":
-                                      {"value": {"enablePrivilege":
+                                      {"value": {u"enablePrivilege":
                                                     {"dangerous": True}}}}}}},
-    "navigator":
+    u"navigator":
-        {"value": {"wifi": {"dangerous": True},
+        {"value": {u"wifi": {"dangerous": True},
-                   "geolocation": {"dangerous": True}}},
+                   u"geolocation": {"dangerous": True}}},
-    "Components":
+    u"Components":
        {"readonly": True,
         "value":
-             {"classes":
+             {u"classes":
                  {"xpcom_wildcard": True,
                   "value":
-                       {"createInstance":
+                       {u"createInstance":
                           {"return": call_definitions.xpcom_constructor("createInstance")},
-                        "getService":
+                        u"getService":
                           {"return": call_definitions.xpcom_constructor("getService")}}},
              "utils":
-                  {"value": {"evalInSandbox":
+                  {"value": {u"evalInSandbox":
                                 {"dangerous": True},
-                             "import":
+                             u"import":
                                 {"dangerous":
                                      lambda a, t:
                                        a and \
-                                        str(t(a[0]).get_literal_value())
+                                        unicode(t(a[0]).get_literal_value())
                                            .count("ctypes.jsm")}}},
-              "interfaces":
+              u"interfaces":
-                  {"value": {"nsIXMLHttpRequest":
+                  {"value": {u"nsIXMLHttpRequest":
                                {"xpcom_map":
                                     lambda:
                                        GLOBAL_ENTITIES["XMLHttpRequest"]},
-                             "nsICategoryManager":
+                             u"nsICategoryManager":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsICategoryManager"]},
-                             "nsIComponentRegistrar":
+                             u"nsIComponentRegistrar":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIComponentRegistrar"]},
-                             "nsIObserverService":
+                             u"nsIObserverService":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIObserverService"]},
-                             "nsIResProtocolHandler":
+                             u"nsIResProtocolHandler":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIResProtocolHandler"]},
-                             "nsIStyleSheetService":
+                             u"nsIStyleSheetService":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIStyleSheetService"]},
-                             "nsIStringBundleService":
+                             u"nsIStringBundleService":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIStringBundleService"]},
-                             "nsIWindowMediator":
+                             u"nsIWindowMediator":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIWindowMediator"]},
-                             "nsIWindowWatcher":
+                             u"nsIWindowWatcher":
                                {"xpcom_map":
                                     lambda:
                                        INTERFACES["nsIWindowWatcher"]},
-                             "nsIProcess":
+                             u"nsIProcess":
                                {"dangerous": True},
-                             "nsIDOMGeoGeolocation":
+                             u"nsIDOMGeoGeolocation":
                                {"dangerous": True},
-                             "nsIX509CertDB":
+                             u"nsIX509CertDB":
                                {"dangerous": True},
-                             "mozIJSSubScriptLoader":
+                             u"mozIJSSubScriptLoader":
                                {"dangerous": True}}}}},
-    "extensions": {"dangerous": True},
+    u"extensions": {"dangerous": True},
-    "xpcnativewrappers": {"dangerous": True},
+    u"xpcnativewrappers": {"dangerous": True},
-    "XMLHttpRequest":
+    u"XMLHttpRequest":
        {"value":
-             {"open": {"dangerous":
+             {u"open": {"dangerous":
                           # Ban synchronous XHR by making sure the third arg
                           # is absent and false.
                           lambda a, t:
@ -241,7 +241,7 @@ GLOBAL_ENTITIES = {
                               "connections."}}},
    # Global properties are inherently read-only, though this formalizes it.
-    "Infinity": {"readonly": True},
+    u"Infinity": {"readonly": True},
-    "NaN": {"readonly": True},
+    u"NaN": {"readonly": True},
-    "undefined": {"readonly": True},
+    u"undefined": {"readonly": True},
    }
--- a/validator/testcases/javascript/spidermonkey.py
+++ b/validator/testcases/javascript/spidermonkey.py
@ -1,3 +1,4 @@
 import codecs
 import json
 import os
 import re
@ -7,7 +8,7 @@ from cStringIO import StringIO
 from validator.constants import SPIDERMONKEY_INSTALLATION
 from validator.contextgenerator import ContextGenerator
-from validator.textfilter import *
+import validator.unicodehelper as unicodehelper
 JS_ESCAPE = re.compile("\\\\+[ux]", re.I)
@ -81,59 +82,21 @@ def prepare_code(code, err, filename):
    # slash: a character is necessary to prevent bad identifier errors
    code = JS_ESCAPE.sub("u", code)
-    encoding = None
+    code = unicodehelper.decode(code)
    try:
        code = unicode(code)  # Make sure we can get a Unicode representation
        code = strip_weird_chars(code, err=err, name=filename)
    except UnicodeDecodeError:
        # If it's not an easily decodeable encoding, detect it and decode that
        code = filter_ascii(code)
    return code
 def strip_weird_chars(chardata, err=None, name=""):
    """
    Return `chardata` with every character rejected by `is_standard_ascii`
    removed.

    The input is split on "\n" and each resulting line is written back
    followed by "\n", so the returned string always ends with a newline.

    If `err` is given, a single warning is reported through `err.warning`
    for the first rejected character only; `name` is passed along as the
    filename of that warning. Later rejected characters are dropped silently.
    """
    cleaned = StringIO()
    already_warned = False
    # Line numbers are reported 1-based, columns 0-based.
    for line_num, line in enumerate(chardata.split("\n"), 1):
        for charpos, char in enumerate(line):
            if is_standard_ascii(char):
                cleaned.write(char)
                continue
            # Rejected character: warn once (when a bundle is available),
            # then keep filtering quietly.
            if err is not None and not already_warned:
                err.warning(("testcases_scripting",
                             "_get_tree",
                             "control_char_filter"),
                            "Invalid control character in JS file",
                            "An invalid character (ASCII 0-31, except CR "
                            "and LF) has been found in a JS file. These "
                            "are considered unsafe and should be removed.",
                            filename=name,
                            line=line_num,
                            column=charpos,
                            context=ContextGenerator(chardata))
            already_warned = True
        cleaned.write("\n")
    return cleaned.getvalue()
 def _get_tree(code, shell=SPIDERMONKEY_INSTALLATION):
    "Returns an AST tree of the JS passed in `code`."
    if not code:
        return None
-    temp = tempfile.NamedTemporaryFile(mode="w+", delete=False)
+    code = unicodehelper.decode(code)
-    temp.write(code)
+
    temp = tempfile.NamedTemporaryFile(mode="w+b", delete=False)
    #temp.write(codecs.BOM_UTF8)
    temp.write(code.encode("utf_8"))
    temp.flush()
    data = """try{
@ -147,7 +110,7 @@ def _get_tree(code, shell=SPIDERMONKEY_INSTALLATION):
    }""" % json.dumps(temp.name)
    try:
-        cmd = [shell, "-e", data]
+        cmd = [shell, "-e", data, "-U"]
        try:
            shell_obj = subprocess.Popen(cmd,
                                   shell=False,
@ -171,11 +134,7 @@ def _get_tree(code, shell=SPIDERMONKEY_INSTALLATION):
    if not data:
        raise JSReflectException("Reflection failed")
-    try:
+    data = unicodehelper.decode(data)
        data = unicode(data)
    except UnicodeDecodeError:
        data = unicode(filter_ascii(data))
    parsed = json.loads(data, strict=False)
    if "error" in parsed and parsed["error"]:
--- a/validator/testcases/javascript/traverser.py
+++ b/validator/testcases/javascript/traverser.py
@ -40,12 +40,14 @@ class MockBundler:
        # Increment the message counter
        self.message_count += 1
-        self.ids.append(id)
+        self.ids.append(err_id)
        error = unicode(error)
        print "-" * 30
-        print error
+        print error.encode("ascii", "replace")
        print "~" * len(error)
-        if isinstance(description, str):
+        if isinstance(description, types.StringTypes):
            print description
        else:
            # Errors can have multiple lines
@ -105,12 +107,14 @@ class Traverser:
            output = data
            if isinstance(data, JSObject) or isinstance(data, JSContext):
                output = data.output()
-            print ". " * self.debug_level + output
+
            output = unicode(output)
            print ". " * self.debug_level + output.encode("ascii", "replace")
    def run(self, data):
        if DEBUG:
            x = open("/tmp/output.js", "w")
-            x.write(str(data))
+            x.write(unicode(data))
            x.close()
        if "type" not in data or not self._can_handle_node(data["type"]):
@ -189,7 +193,7 @@ class Traverser:
        if action is not None:
            action_result = action(self, node)
            self._debug("ACTION>>%s (%s)" %
-                    ("halt>>%s" % str(action_result) if
+                    ("halt>>%s" % unicode(action_result) if
                        action_result else
                        "continue",
                     node["type"]))
@ -350,7 +354,8 @@ class Traverser:
                                  "_build_global",
                                  "dangerous_global"),
                                 "Dangerous Global Object",
-                                 [dang if isinstance(dang, str) else
+                                 [dang if
                                  isinstance(dang, types.StringTypes) else
                                  "A dangerous or banned global object was "
                                  "accessed by some JavaScript code.",
                                  "Accessed object: %s" % name],
--- a/validator/testcases/markup/markuptester.py
+++ b/validator/testcases/markup/markuptester.py
@ -1,4 +1,3 @@
 import re
 try:
    from HTMLParser import HTMLParser
@ -6,6 +5,7 @@ except ImportError:  # pragma: no cover
    from html.parser import HTMLParser
 import validator.testcases.scripting as scripting
 import validator.unicodehelper as unicodehelper
 from validator.testcases.markup import csstester
 from validator.contextgenerator import ContextGenerator
 from validator.constants import *
@ -51,7 +51,7 @@ class MarkupParser(HTMLParser):
        self.xml_state = []
        self.xml_buffer = []
-        self.reported = {}
+        self.reported = set()
    def process(self, filename, data, extension="xul"):
        """Processes data by splitting it into individual lines, then
@ -61,7 +61,7 @@ class MarkupParser(HTMLParser):
        self.filename = filename
        self.extension = extension
-        self.reported = {}
+        self.reported = set()
        self.context = ContextGenerator(data)
@ -100,6 +100,8 @@ class MarkupParser(HTMLParser):
        try:
            self.feed(line + "\n")
        except UnicodeDecodeError:
            raise
        except Exception as inst:
            if DEBUG:  # pragma: no cover
                print self.xml_state, inst
@ -107,7 +109,7 @@ class MarkupParser(HTMLParser):
            if "markup" in self.reported:
                return
-            if "script" in self.xml_state or (
+            if ("script" in self.xml_state or
                self.debug and "testscript" in self.xml_state):
                if "script_comments" in self.reported or not self.strict:
                    return
@ -122,7 +124,7 @@ class MarkupParser(HTMLParser):
                                self.filename,
                                line=self.line,
                                context=self.context)
-                self.reported["script_comments"] = True
+                self.reported.add("script_comments")
                return
            if self.strict:
@ -136,7 +138,7 @@ class MarkupParser(HTMLParser):
                                 self.filename,
                                 line=self.line,
                                 context=self.context)
-            self.reported["markup"] = True
+            self.reported.add("markup")
    def handle_startendtag(self, tag, attrs):
        # Self closing tags don't have an end tag, so we want to
@ -154,7 +156,7 @@ class MarkupParser(HTMLParser):
            self_closing = tag in SELF_CLOSING_TAGS
        if DEBUG:  # pragma: no cover
-            print self.xml_state, tag, self_closing
+            print "S: ", self.xml_state, tag, self_closing
        # A fictional tag for testing purposes.
        if tag == "xbannedxtestx":
@ -286,17 +288,19 @@ class MarkupParser(HTMLParser):
            return
        self.xml_state.append(tag)
-        self.xml_buffer.append("")
+        self.xml_buffer.append(unicode(""))
    def handle_endtag(self, tag):
        tag = tag.lower()
        if DEBUG:  # pragma: no cover
-            print tag, self.xml_state
+            print "E: ", tag, self.xml_state
        if not self.xml_state:
            if "closing_tags" in self.reported or not self.strict:
                if DEBUG:
                    print "Unstrict; extra closing tags ------"
                return
            self.err.warning(("testcases_markup_markuptester",
                              "handle_endtag",
@ -307,16 +311,18 @@ class MarkupParser(HTMLParser):
                             self.filename,
                             line=self.line,
                             context=self.context)
-            self.reported["closing_tags"] = True
+            self.reported.add("closing_tags")
            if DEBUG:  # pragma: no cover
                print "Too many closing tags ------"
            return
-        elif "script" in self.xml_state:
+        elif "script" in self.xml_state[:-1]:
            # If we're in a script tag, nothing else matters. Just rush
            # everything possible into the xml buffer.
            self._save_to_buffer("</" + tag + ">")
            if DEBUG:
                print "Markup as text in script ------"
            return
        elif tag not in self.xml_state:
@ -344,6 +350,8 @@ class MarkupParser(HTMLParser):
        # classifies as a self-closing tag, we just recursively close
        # down to the level of the tag we're actually closing.
        if old_state != tag and old_state in SELF_CLOSING_TAGS:
            if DEBUG:
                print "Self closing tag cascading down ------"
            return self.handle_endtag(tag)
        # If this is an XML-derived language, everything must nest
@ -365,7 +373,10 @@ class MarkupParser(HTMLParser):
            if DEBUG:  # pragma: no cover
                print "Invalid markup nesting ------"
        data_buffer = data_buffer.strip()
        # Perform analysis on collected data.
        if data_buffer:
            if tag == "script":
                scripting.test_js_snippet(self.err,
                                          data_buffer,
@ -413,6 +424,8 @@ class MarkupParser(HTMLParser):
        if not self.xml_buffer:
            return
        data = unicodehelper.decode(data)
        self.xml_buffer[-1] += data
    def _format_args(self, args):
--- a/validator/unicodehelper.py
+++ b/validator/unicodehelper.py
@ -0,0 +1,55 @@
 import codecs
 import textfilter
 # Many thanks to nmaier for inspiration and code in this module
 # (BOM, codec) pairs that decode() checks in this order against the start of
 # the data. The UTF-32 entries must stay ahead of the UTF-16 ones: the
 # UTF-32 LE BOM begins with the same two bytes as the UTF-16 LE BOM, so
 # testing UTF-16 first would misdetect UTF-32 LE input.
 UNICODES = [
    (codecs.BOM_UTF8, "utf-8"),
    (codecs.BOM_UTF32_LE, "utf-32-le"),
    (codecs.BOM_UTF32_BE, "utf-32-be"),
    (codecs.BOM_UTF16_LE, "utf-16-le"),
    (codecs.BOM_UTF16_BE, "utf-16-be"),
    ]
 # Codecs decode() tries, in order, once BOM detection and the strict UTF-8
 # and latin_1 attempts have not produced a result.
 COMMON_ENCODINGS = ("utf-16", "latin_1", "ascii")
 def decode(data):
    """
    Decode data employing some charset detection and including unicode BOM
    stripping.

    Anything that is not a `str` is returned unchanged. For a `str`, the
    following are attempted in order and the first success is returned:

    1. A BOM listed in `UNICODES`: the BOM is stripped and the rest is
       decoded with the matching codec, ignoring undecodable bytes.
    2. Strict UTF-8.
    3. latin_1.
    4. Each codec in `COMMON_ENCODINGS`.
    5. `textfilter.filter_ascii`, converted to unicode with replacement.

    Only `UnicodeDecodeError` is treated as "try the next candidate"; any
    other exception propagates to the caller.
    """
    # Don't make more work than we have to.
    if not isinstance(data, str):
        return data
    # Detect standard unicodes.
    for bom, encoding in UNICODES:
        if data.startswith(bom):
            return unicode(data[len(bom):], encoding, "ignore")
    # Try straight UTF-8
    try:
        return unicode(data, "utf-8")
    except UnicodeDecodeError:
        pass
    # Test for latin_1, because it can be matched as UTF-16
    # Somewhat of a hack, but it works and is about a thousand times faster
    # than using chardet.
    # NOTE(review): for a byte string every ord(c) is below 256 and latin_1
    # maps every byte value, so this branch looks like it always returns and
    # the fallbacks below are never reached -- confirm before relying on them.
    if all(ord(c) < 256 for c in data):
        try:
            return unicode(data, "latin_1")
        except UnicodeDecodeError:
            pass
    # Test for various common encodings.
    for encoding in COMMON_ENCODINGS:
        try:
            return unicode(data, encoding)
        except UnicodeDecodeError:
            pass
    # Anything else gets filtered.
    return unicode(textfilter.filter_ascii(data), errors="replace")