No bug, update Readability and JSDOMParser to github tip, rs=me

--HG-- extra : rebase_source : dbfc50e5de39ea8ac7190d48af10121a91d77411
2015-04-02 21:48:31 +01:00 · 2015-04-02 21:48:31 +01:00 · 352a7c0220
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@ -615,6 +615,7 @@
        this.childNodes[i].parentNode = null;
      }
      this.childNodes = node.childNodes;
+      this.children = node.children;
      for (var i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = this;
      }
@ -628,6 +629,7 @@

      var node = new Text();
      this.childNodes = [ node ];
+      this.children = [];
      node.textContent = text;
      node.parentNode = this;
    },
@ -924,14 +926,59 @@
    },

    readScript: function (node) {
-      var index = this.html.indexOf("</script>", this.currentChar);
-      if (index === -1) {
-        index = this.html.length;
+      while (this.currentChar < this.html.length) {
+        var c = this.nextChar();
+        var nextC = this.peekNext();
+        if (c === "<") {
+          if (nextC === "!" || nextC === "?") {
+            // We're still before the ! or ? that is starting this comment:
+            this.currentChar++;
+            node.appendChild(this.discardNextComment());
+            continue;
+          }
+          if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
+            // Go back before the '<' so we find the end tag.
+            this.currentChar--;
+            // Done with this script tag, the caller will close:
+            return;
+          }
+        }
+        // Either c wasn't a '<', or it was a '<' that started neither a comment
+        // nor a closing script tag, so just consume everything up to the next
+        // '<' as text:
+
+        var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
+        var textNode = haveTextNode ? node.lastChild : new Text();
+        var n = this.html.indexOf("<", this.currentChar);
+        // Step currentChar back to include c in the text, but only *after* searching for
+        // the next '<', so that we don't get stuck finding the same '<' over and over.
+        this.currentChar--;
+        if (n === -1) {
+          textNode.textContent += this.html.substring(this.currentChar, this.html.length);
+          this.currentChar = this.html.length;
+        } else {
+          textNode.textContent += this.html.substring(this.currentChar, n);
+          this.currentChar = n;
+        }
+        if (!haveTextNode)
+          node.appendChild(textNode);
      }
-      var txt = new Text();
-      txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
-      node.appendChild(txt);
-      this.currentChar = index;
+    },
+
+    discardNextComment: function() {
+      if (this.match("--")) {
+        this.discardTo("-->");
+      } else {
+        var c = this.nextChar();
+        while (c !== ">") {
+          if (c === undefined)
+            return null;
+          if (c === '"' || c === "'")
+            this.readString(c);
+          c = this.nextChar();
+        }
+      }
+      return new Comment();
    },


@ -969,20 +1016,9 @@
      // them away in readChildren()). So just returning an empty Comment node
      // here is sufficient.
      if (c === "!" || c === "?") {
+        // We're still before the ! or ? that is starting this comment:
        this.currentChar++;
-        if (this.match("--")) {
-          this.discardTo("-->");
-        } else {
-          var c = this.nextChar();
-          while (c !== ">") {
-            if (c === undefined)
-              return null;
-            if (c === '"' || c === "'")
-              this.readString(c);
-            c = this.nextChar();
-          }
-        }
-        return new Comment();
+        return this.discardNextComment();
      }

      // If we're reading a closing tag, return null. This means we've reached
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@ -95,7 +95,7 @@ Readability.prototype = {
  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
-    unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
+    unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
    positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
@ -1448,6 +1448,29 @@ Readability.prototype = {
    });
  },

+  /**
+   * Check whether a node's parent, or an ancestor at most maxDepth levels
+   * above that parent (default 3), has the provided tag name.
+   * @param  HTMLElement node
+   * @param  String      tagName
+   * @param  Number      maxDepth
+   * @return Boolean
+   */
+  _hasAncestorTag: function(node, tagName, maxDepth) {
+    maxDepth = maxDepth || 3;
+    tagName = tagName.toUpperCase();
+    var depth = 0;
+    while (node.parentNode) {
+      if (depth > maxDepth)
+        return false;
+      if (node.parentNode.tagName === tagName)
+        return true;
+      node = node.parentNode;
+      depth++;
+    }
+    return false;
+  },
+
  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@ -1493,8 +1516,7 @@ Readability.prototype = {
        var linkDensity = this._getLinkDensity(tagsList[i]);
        var contentLength = this._getInnerText(tagsList[i]).length;
        var toRemove = false;
-
-        if (img > p) {
+        if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
          toRemove = true;
        } else if (li > p && tag !== "ul" && tag !== "ol") {
          toRemove = true;
@ -1510,8 +1532,9 @@ Readability.prototype = {
          toRemove = true;
        }

-        if (toRemove)
+        if (toRemove) {
          tagsList[i].parentNode.removeChild(tagsList[i]);
+        }
      }
    }
  },