From 6ee9eceb0536288790e3b7b84ab6fe654bad88ac Mon Sep 17 00:00:00 2001
From: Matt Basta <mbasta@mozilla.com>
Date: Thu, 29 Jul 2010 15:06:57 -0700
Subject: [PATCH] Fixed bug with CDATA in markuptester; Error messaging fixes

---
 validator/errorbundler.py                  | 149 ++++++++++-----------
 validator/main.py                          |   6 +
 validator/testcases/content.py             |   2 +-
 validator/testcases/markup/csstester.py    |  10 +-
 validator/testcases/markup/markuptester.py | 142 ++++++++++++++------
 validator/typedetection.py                 |   5 +-
 6 files changed, 193 insertions(+), 121 deletions(-)

diff --git a/validator/errorbundler.py b/validator/errorbundler.py
index f76fe79..5a377cf 100644
--- a/validator/errorbundler.py
+++ b/validator/errorbundler.py
@@ -20,8 +20,10 @@ class ErrorBundle(object):
         self.errors = []
         self.warnings = []
         self.infos = []
+        self.message_tree = {}
         
         self.metadata = {}
+        self.cluster = False
         
         self.subpackages = []
         self.package_stack = []
@@ -35,7 +37,8 @@ class ErrorBundle(object):
         
     def error(self, err_id, error, description='', filename='', line=0):
         "Stores an error message for the validation process"
-        self.errors.append({"id": err_id,
+        self._save_message(self.errors,
+                           {"id": err_id,
                             "message": error,
                             "description": description,
                             "file": filename,
@@ -44,22 +47,39 @@ class ErrorBundle(object):
         
     def warning(self, err_id, warning, description='', filename='', line=0):
         "Stores a warning message for the validation process"
-        self.warnings.append({"id": err_id,
-                              "message": warning,
-                              "description": description,
-                              "file": filename,
-                              "line": line})
+        self._save_message(self.warnings,
+                           {"id": err_id,
+                            "message": warning,
+                            "description": description,
+                            "file": filename,
+                            "line": line})
         return self
 
     def info(self, err_id, info, description='', filename='', line=0):
         "Stores an informational message about the validation"
-        self.infos.append({"id": err_id,
-                           "message": info,
-                           "description": description,
-                           "file": filename,
-                           "line": line})
+        self._save_message(self.infos,
+                           {"id": err_id,
+                            "message": info,
+                            "description": description,
+                            "file": filename,
+                            "line": line})
         return self
         
+    def _save_message(self, stack, message):
+        "Stores a message in the appropriate message stack."
+        
+        stack.append(message)
+        
+        tree = self.message_tree
+        last_id = None
+        for eid in message["id"]:
+            if last_id is not None:
+                tree = tree[last_id]
+            if eid not in tree:
+                tree[eid] = {}
+            last_id = eid
+        tree[last_id] = message
+        
     def set_type(self, type_):
         "Stores the type of addon we're scanning"
         self.detected_type = type_
@@ -83,7 +103,6 @@ class ErrorBundle(object):
         
         self.resources[name] = resource
         
-    
     def is_nested_package(self):
         "Returns whether the current package is within a PACKAGE_MULTI"
         
@@ -96,12 +115,17 @@ class ErrorBundle(object):
                                  "warnings": self.warnings,
                                  "infos": self.infos,
                                  "detected_type": self.detected_type,
-                                 "resources": self.resources})
+                                 "resources": self.resources,
+                                 "message_tree": self.message_tree})
+        
+        # Note that the message tree is not saved because it is simply rebuilt
+        # when the messages are rewritten.
         
         self.errors = []
         self.warnings = []
         self.infos = []
         self.resources = {}
+        self.message_tree = {}
         
         self.package_stack.append(new_file)
     
@@ -113,6 +137,7 @@ class ErrorBundle(object):
         errors = self.errors
         warnings = self.warnings
         infos = self.infos
+        # We only rebuild message_tree anyway. No need to restore.
         
         # Copy the existing state back into place
         self.errors = state["errors"]
@@ -120,51 +145,34 @@ class ErrorBundle(object):
         self.infos = state["infos"]
         self.detected_type = state["detected_type"]
         self.resources = state["resources"]
+        self.message_tree = state["message_tree"]
         
         name = self.package_stack.pop()
         
+        self._merge_messages(errors, self.error, name)
+        self._merge_messages(warnings, self.warning, name)
+        self._merge_messages(infos, self.info, name)
+        
+    
+    def _merge_messages(self, messages, callback, name):
+        "Merges a stack of messages into another stack of messages"
+        
         # Overlay the popped warnings onto the existing ones.
-        for error in errors:
+        for message in messages:
             trace = [name]
-            
             # If there are sub-sub-packages, they'll be in a list.
-            if type(error["file"]) is list:
-                trace.extend(error["file"])
+            if isinstance(message["file"], list):
+                trace.extend(message["file"])
             else:
-                trace.append(error["file"])
+                trace.append(message["file"])
             
             # Write the errors with the file structure delimited by
             # right carets.
-            self.error(error["id"],
-                       "%s > %s" % (name, error["message"]),
-                       error["description"],
-                       trace,
-                       error["line"])
-                       
-        for warning in warnings:
-            trace = [name]
-            if type(warning["file"]) is list:
-                trace.extend(warning["file"])
-            else:
-                trace.append(warning["file"])
-            
-            self.warning(warning["id"],
-                         "%s > %s" % (name, warning["message"]),
-                         warning["description"],
-                         trace,
-                         warning["line"])
-        for info in infos:
-            trace = [name]
-            if type(info["file"]) is list:
-                trace.extend(info["file"])
-            else:
-                trace.append(info["file"])
-            
-            self.info(info["id"],
-                      "%s > %s" % (name, info["message"]),
-                      info["description"],
-                      trace,
-                      info["line"])
+            callback(message["id"],
+                     "%s > %s" % (name, message["message"]),
+                     message["description"],
+                     trace,
+                     message["line"])
         
     
     def _clean_description(self, message, json=False):
@@ -176,32 +184,20 @@ class ErrorBundle(object):
     def _clean_message(self, desc, json=False):
         "Cleans all the nasty whitespace from a string."
         
-        if not isinstance(desc, list):
-            desc = desc.split("\n")
-        
         output = []
-        merge = []
-        # Loop through each item in the multipart description.
-        for line in desc:
-            # If it's a string, add it to the line buffer for concat.
-            if isinstance(line, str):
-                merge.append(line.strip())
-            # If it's a list, just append it.
-            elif isinstance(line, list):
-                # While you're in here, concat and flush the line buffer.
-                if merge:
-                    output.append(" ".join(merge))
-                
-                # JSON keeps the structure, plain text normalizes.
-                if json:
-                    output.append(line)
-                else:
-                    output.append(" ".join(line))
-        # Finish up the line buffer.
-        if merge:
-            output.append(" ".join(merge))
         
-        return output
+        if not isinstance(desc, list):
+            lines = desc.splitlines()
+            for line in lines:
+                output.append(line.strip())
+            return " ".join(output)
+        else:
+            for line in desc:
+                output.append(self._clean_message(line, json))
+            if json:
+                return output
+            else:
+                return "\n".join(output)
     
     def print_json(self):
         "Prints a JSON summary of the validation operation."
@@ -286,8 +282,7 @@ class ErrorBundle(object):
         "Prints a message and takes care of all sorts of nasty code"
         
         # Load up the standard output.
-        output = [prefix,
-                  message["message"]]
+        output = [prefix, self._clean_message([message["message"]])]
         
         # We have some extra stuff for verbose mode.
         if verbose:
@@ -296,8 +291,8 @@ class ErrorBundle(object):
             # Detailed problem description.
             if message["description"]:
                 # These are dirty, so strip out whitespace and concat.
-                verbose_output.append("\n".join(
-                            self._clean_message(message["description"])))
+                verbose_output.append(
+                            self._clean_message(message["description"]))
             
             # If file information is availe, output that as well.
             files = message["file"]
@@ -327,8 +322,8 @@ class ErrorBundle(object):
     def _print_verbose(self, verbose):
         "Prints info code to help prevent code duplication"
         
-        mesg = "<<WHITE>>Notice:<<NORMAL>>\t"
         if self.infos and verbose:
+            mesg = "<<WHITE>>Notice:<<NORMAL>>\t"
             for info in self.infos:
                 self._print_message(mesg, info)
         
\ No newline at end of file
diff --git a/validator/main.py b/validator/main.py
index bbd8ef1..d7165c1 100644
--- a/validator/main.py
+++ b/validator/main.py
@@ -56,6 +56,11 @@ def main():
                         const=True,
                         help="""If the output format supports it, makes
                         the analysis summary include extra info.""")
+    parser.add_argument("--cluster",
+                        action="store_const",
+                        const=True,
+                        help="""Structures output in clustered format,
+                        grouping similar errors together.""")
     parser.add_argument("--file",
                         type=argparse.FileType("w"),
                         default=sys.stdout,
@@ -78,6 +83,7 @@ def main():
 
     error_bundle = ErrorBundle(args.file,
         not args.file == sys.stdout or args.boring)
+    error_bundle.cluster = args.cluster
 
     # Emulates the "$status" variable in the original validation.php
     # test file. Equal to "$status == STATUS_LISTED".
diff --git a/validator/testcases/content.py b/validator/testcases/content.py
index fe89fc3..4db0ff2 100644
--- a/validator/testcases/content.py
+++ b/validator/testcases/content.py
@@ -55,7 +55,7 @@ def test_packed_packages(err, package_contents=None, xpi_package=None):
             # Let the error bunder know we're in a sub-package.
             err.push_state(data["name_lower"])
             err.set_type(PACKAGE_SUBPACKAGE) # Subpackage
-            testendpoint_validator.test_inner_package(err,
+            testendpoint_validator.main.test_inner_package(err,
                                                       temp_contents,
                                                       sub_xpi)
             
diff --git a/validator/testcases/markup/csstester.py b/validator/testcases/markup/csstester.py
index d8b5e19..1dc7319 100644
--- a/validator/testcases/markup/csstester.py
+++ b/validator/testcases/markup/csstester.py
@@ -13,7 +13,10 @@ def test_css_file(err, filename, data, line_start=1):
     try:
         _run_css_tests(err, token_generator, filename, line_start - 1)
     except UnicodeDecodeError:
-        err.warning("Unicode decode error.",
+        err.warning(("testcases_markup_csstester",
+                     "test_css_file",
+                     "unicode_decode"),
+                    "Unicode decode error.",
                     """While decoding a CSS file, an unknown character
                     was encountered, causing some problems.""",
                     filename)
@@ -40,7 +43,10 @@ def _run_css_tests(err, tokens, filename, line_start=0):
         if tok_type == "IDENT":
             last_descriptor = value.lower()
             if value.startswith("-webkit"):
-                err.error("Blasphemy.",
+                err.error(("testcases_markup_csstester",
+                           "_run_css_tests",
+                           "webkit"),
+                          "Blasphemy.",
                           "WebKit descriptors? Really?",
                           filename,
                           line)
diff --git a/validator/testcases/markup/markuptester.py b/validator/testcases/markup/markuptester.py
index 228ab33..f4a44fc 100644
--- a/validator/testcases/markup/markuptester.py
+++ b/validator/testcases/markup/markuptester.py
@@ -8,7 +8,7 @@ except ImportError: # pragma: no cover
 from validator.testcases.markup import csstester
 from validator.constants import *
 
-DEBUG = True
+DEBUG = False
 
 UNSAFE_TAGS = ("script",
                "object",
@@ -45,7 +45,7 @@ class MarkupParser(HTMLParser):
         self.xml_state = []
         self.xml_buffer = []
         
-        self.alerted_script_comments = False
+        self.reported = {}
         
     def process(self, filename, data, extension="xul"):
         """Processes data by splitting it into individual lines, then
@@ -56,48 +56,78 @@ class MarkupParser(HTMLParser):
         self.filename = filename
         self.extension = extension
         
-        reported = False
+        self.reported = {}
         
         lines = data.split("\n")
+        line_buffer = []
+        force_buffer = False
         for line in lines:
-            try:
-                self.line += 1
-                self.feed(line + "\n")
-            except Exception as inst:
-                
-                if DEBUG: # pragma: no cover
-                    print self.xml_state, inst
-                
-                if reported:
-                    continue
-                
-                if "script" in self.xml_state or (
-                   self.debug and "testscript" in self.xml_state):
-                    if self.alerted_script_comments:
+            self.line += 1
+            
+            search_line = line
+            while True:
+                if "<![CDATA[" in search_line and not force_buffer:
+                    cdatapos = search_line.find("<![CDATA[")
+                    post_cdata = search_line[cdatapos:]
+                    
+                    if "]]>" in post_cdata:
+                        search_line = post_cdata[post_cdata.find("]]>")+3:]
                         continue
-                    self.err.info(("testcases_markup_markuptester",
-                                   "process",
-                                   "missing_script_comments"),
-                                  "Missing comments in <script> tag",
-                                  """Markup parsing errors occurred
-                                  while trying to parse the file. This
-                                  can likely be mitigated by wrapping
-                                  <script> tag contents in HTML comment
-                                  tags (<!-- -->)""",
-                                  self.filename,
-                                  self.line)
-                    self.alerted_script_comments = True
-                    continue
-                
-                self.err.warning(("testcases_markup_markuptester",
-                                  "process",
-                                  "parse_error"),
-                                 "Markup parsing error",
-                                 """There was an error parsing the
-                                 markup document.""",
-                                 self.filename,
-                                 self.line)
-                reported = True
+                    force_buffer = True
+                elif "]]>" in search_line and force_buffer:
+                    force_buffer = False
+                break
+            
+            if force_buffer:
+                line_buffer.append(line)
+            else:
+                if line_buffer:
+                    line_buffer.append(line)
+                    line = "\n".join(line_buffer)
+                    line_buffer = []
+                self._feed_parser(line)
+    
+    def _feed_parser(self, line):
+        "Feeds data into the parser"
+        
+        try:
+            self.feed(line + "\n")
+        except Exception as inst:
+            if DEBUG: # pragma: no cover
+                print self.xml_state, inst
+            
+            if "markup" in self.reported:
+                return
+            
+            if "script" in self.xml_state or (
+               self.debug and "testscript" in self.xml_state):
+                if "script_comments" in self.reported:
+                    return
+                self.err.info(("testcases_markup_markuptester",
+                               "_feed",
+                               "missing_script_comments"),
+                              "Missing comments in <script> tag",
+                              """Markup parsing errors occurred
+                              while trying to parse the file. This
+                              can likely be mitigated by wrapping
+                              <script> tag contents in HTML comment
+                              tags (<!-- -->)""",
+                              self.filename,
+                              self.line)
+                self.reported["script_comments"] = True
+                return
+            
+            self.err.warning(("testcases_markup_markuptester",
+                              "_feed",
+                              "parse_error"),
+                             "Markup parsing error",
+                             ["""There was an error parsing the markup
+                              document.""",
+                              str(inst)],
+                             self.filename,
+                             self.line)
+            self.reported["markup"] = True
+        
     
     def handle_startendtag(self, tag, attrs):
         # Self closing tags don't have an end tag, so we want to
@@ -251,6 +281,8 @@ class MarkupParser(HTMLParser):
             print tag, self.xml_state
         
         if not self.xml_state:
+            if "closing_tags" in self.reported:
+                return
             self.err.error(("testcases_markup_markuptester",
                             "handle_endtag",
                             "extra_closing_tags"),
@@ -259,6 +291,7 @@ class MarkupParser(HTMLParser):
                            than it has opening tags.""",
                            self.filename,
                            self.line)
+            self.reported["closing_tags"] = True
             if DEBUG: # pragma: no cover
                 print "Too many closing tags ------"
             return
@@ -328,6 +361,35 @@ class MarkupParser(HTMLParser):
         
     def handle_comment(self, data):
         self._save_to_buffer(data)
+        
+    def parse_marked_section(self, i, report=0):
+        rawdata= self.rawdata
+        _markedsectionclose = re.compile(r']\s*]\s*>')
+        
+        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
+        sectName, j = self._scan_name( i+3, i )
+        if j < 0:
+            return j
+        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
+            # look for standard ]]> ending
+            match= _markedsectionclose.search(rawdata, i+3)
+        else:
+            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
+        if not match:
+            return -1
+        if report:
+            j = match.start(0)
+            self.unknown_decl(rawdata[i+3: j])
+        return match.end(0)
+        
+    def _unknown_decl(self, decl):
+        "Handles non-DOCTYPE SGML declarations in *ML documents."
+        #print decl
+        decl = decl.strip()
+        split_decl = decl.split()
+        
+        self.out_buffer[split_decl[1]] = split_decl[2].strip('\'"')
+        
     
     def _save_to_buffer(self, data):
         """Save data to the XML buffer for the current tag."""
diff --git a/validator/typedetection.py b/validator/typedetection.py
index 63df425..e31aff2 100644
--- a/validator/typedetection.py
+++ b/validator/typedetection.py
@@ -52,7 +52,10 @@ def detect_type(err, install_rdf=None, xpi_package=None):
                       "install.rdf")
             return
     else:
-        err.info("No <em:type> element found in install.rdf",
+        err.info(("typedetection",
+                  "detect_type",
+                  "no_em:type"),
+                 "No <em:type> element found in install.rdf",
                  """It isn't always required, but it is the most
                  reliable method for determining addon type.""",
                  "install.rdf")