No bug: update readability libs to the latest GitHub versions to include significant perf and quality improvements, rs=me

--HG-- extra : rebase_source : 464cf93b5110cc456454bab0b698bc10d32bea49
2015-03-20 20:50:45 -07:00 · 2015-03-20 20:50:45 -07:00 · 82c7c6de1e
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@ -1,3 +1,10 @@
+/*
+ * DO NOT MODIFY THIS FILE DIRECTLY!
+ *
+ * This is a shared library that is maintained in an external repo:
+ * https://github.com/mozilla/readability
+ */
+
 /* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
@ -31,7 +38,7 @@
  }

  // When a style is set in JS, map it to the corresponding CSS attribute
-  let styleMap = {
+  var styleMap = {
    "alignmentBaseline": "alignment-baseline",
    "background": "background",
    "backgroundAttachment": "background-attachment",
@ -223,7 +230,7 @@
  };

  // Elements that can be self-closing
-  let voidElems = {
+  var voidElems = {
    "area": true,
    "base": true,
    "br": true,
@ -239,8 +246,10 @@
    "source": true,
  };

+  var whitespace = [" ", "\t", "\n", "\r"];
+
  // See http://www.w3schools.com/dom/dom_nodetype.asp
-  let nodeTypes = {
+  var nodeTypes = {
    ELEMENT_NODE: 1,
    ATTRIBUTE_NODE: 2,
    TEXT_NODE: 3,
@ -257,14 +266,12 @@

  function getElementsByTagName(tag) {
    tag = tag.toUpperCase();
-    let elems = [];
-    let allTags = (tag === "*");
+    var elems = [];
+    var allTags = (tag === "*");
    function getElems(node) {
-      let length = node.childNodes.length;
-      for (let i = 0; i < length; i++) {
-        let child = node.childNodes[i];
-        if (child.nodeType !== 1)
-          continue;
+      var length = node.children.length;
+      for (var i = 0; i < length; i++) {
+        var child = node.children[i];
        if (allTags || (child.tagName === tag))
          elems.push(child);
        getElems(child);
@ -274,7 +281,7 @@
    return elems;
  }

-  let Node = function () {};
+  var Node = function () {};

  Node.prototype = {
    attributes: null,
@ -283,18 +290,23 @@
    nodeName: null,
    parentNode: null,
    textContent: null,
+    nextSibling: null,
+    previousSibling: null,

    get firstChild() {
      return this.childNodes[0] || null;
    },

-    get nextSibling() {
-      if (this.parentNode) {
-        let childNodes = this.parentNode.childNodes;
-        return childNodes[childNodes.indexOf(this) + 1] || null;
-      }
+    get firstElementChild() {
+      return this.children[0] || null;
+    },

-      return null;
+    get lastChild() {
+      return this.childNodes[this.childNodes.length - 1] || null;
+    },
+
+    get lastElementChild() {
+      return this.children[this.children.length - 1] || null;
    },

    appendChild: function (child) {
@ -302,48 +314,152 @@
        child.parentNode.removeChild(child);
      }

+      var last = this.lastChild;
+      if (last)
+        last.nextSibling = child;
+      child.previousSibling = last;
+
+      if (child.nodeType === Node.ELEMENT_NODE) {
+        child.previousElementSibling = this.children[this.children.length - 1] || null;
+        this.children.push(child);
+        child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
+      }
      this.childNodes.push(child);
      child.parentNode = this;
    },

    removeChild: function (child) {
-      let childNodes = this.childNodes;
-      let childIndex = childNodes.indexOf(child);
+      var childNodes = this.childNodes;
+      var childIndex = childNodes.indexOf(child);
      if (childIndex === -1) {
        throw "removeChild: node not found";
      } else {
        child.parentNode = null;
+        var prev = child.previousSibling;
+        var next = child.nextSibling;
+        if (prev)
+          prev.nextSibling = next;
+        if (next)
+          next.previousSibling = prev;
+
+        if (child.nodeType === Node.ELEMENT_NODE) {
+          prev = child.previousElementSibling;
+          next = child.nextElementSibling;
+          if (prev)
+            prev.nextElementSibling = next;
+          if (next)
+            next.previousElementSibling = prev;
+          this.children.splice(this.children.indexOf(child), 1);
+        }
+
+        child.previousSibling = child.nextSibling = null;
+        child.previousElementSibling = child.nextElementSibling = null;
+
        return childNodes.splice(childIndex, 1)[0];
      }
    },

    replaceChild: function (newNode, oldNode) {
-      let childNodes = this.childNodes;
-      let childIndex = childNodes.indexOf(oldNode);
+      var childNodes = this.childNodes;
+      var childIndex = childNodes.indexOf(oldNode);
      if (childIndex === -1) {
        throw "replaceChild: node not found";
      } else {
+        // This will take care of updating the new node if it was somewhere else before:
        if (newNode.parentNode)
          newNode.parentNode.removeChild(newNode);

        childNodes[childIndex] = newNode;
+
+        // update the new node's sibling properties, and its new siblings' sibling properties
+        newNode.nextSibling = oldNode.nextSibling;
+        newNode.previousSibling = oldNode.previousSibling;
+        if (newNode.nextSibling)
+          newNode.nextSibling.previousSibling = newNode;
+        if (newNode.previousSibling)
+          newNode.previousSibling.nextSibling = newNode;
+
        newNode.parentNode = this;
+
+        // Now deal with elements before we clear out those values for the old node,
+        // because it can help us take shortcuts here:
+        if (newNode.nodeType === Node.ELEMENT_NODE) {
+          if (oldNode.nodeType === Node.ELEMENT_NODE) {
+            // Both were elements, which makes this easier, we just swap things out:
+            newNode.previousElementSibling = oldNode.previousElementSibling;
+            newNode.nextElementSibling = oldNode.nextElementSibling;
+            if (newNode.previousElementSibling)
+              newNode.previousElementSibling.nextElementSibling = newNode;
+            if (newNode.nextElementSibling)
+              newNode.nextElementSibling.previousElementSibling = newNode;
+            this.children[this.children.indexOf(oldNode)] = newNode;
+          } else {
+            // Hard way:
+            newNode.previousElementSibling = (function() {
+              for (var i = childIndex - 1; i >= 0; i--) {
+                if (childNodes[i].nodeType === Node.ELEMENT_NODE)
+                  return childNodes[i];
+              }
+              return null;
+            })();
+            if (newNode.previousElementSibling) {
+              newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
+            } else {
+              newNode.nextElementSibling = (function() {
+                for (var i = childIndex + 1; i < childNodes.length; i++) {
+                  if (childNodes[i].nodeType === Node.ELEMENT_NODE)
+                    return childNodes[i];
+                }
+                return null;
+              })();
+            }
+            if (newNode.previousElementSibling)
+              newNode.previousElementSibling.nextElementSibling = newNode;
+            if (newNode.nextElementSibling)
+              newNode.nextElementSibling.previousElementSibling = newNode;
+
+            if (newNode.nextElementSibling)
+              this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
+            else
+              this.children.push(newNode);
+          }
+        } else {
+          // new node is not an element node.
+          // if the old one was, update its element siblings:
+          if (oldNode.nodeType === Node.ELEMENT_NODE) {
+            if (oldNode.previousElementSibling)
+              oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
+            if (oldNode.nextElementSibling)
+              oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
+            this.children.splice(this.children.indexOf(oldNode), 1);
+          }
+          // If the old node wasn't an element, neither the new nor the old node was an element,
+          // and the children array and its members shouldn't need any updating.
+        }
+
+
        oldNode.parentNode = null;
+        oldNode.previousSibling = null;
+        oldNode.nextSibling = null;
+        if (oldNode.nodeType === Node.ELEMENT_NODE) {
+          oldNode.previousElementSibling = null;
+          oldNode.nextElementSibling = null;
+        }
        return oldNode;
      }
    }
  };

-  for (let i in nodeTypes) {
+  for (var i in nodeTypes) {
    Node[i] = Node.prototype[i] = nodeTypes[i];
  }

-  let Attribute = function (name, value) {
+  var Attribute = function (name, value) {
    this.name = name;
    this.value = value;
  };

-  let Comment = function () {
+  var Comment = function () {
    this.childNodes = [];
  };

@ -354,7 +470,7 @@
    nodeType: Node.COMMENT_NODE
  };

-  let Text = function () {
+  var Text = function () {
    this.childNodes = [];
  };

@ -366,9 +482,10 @@
    textContent: ""
  }

-  let Document = function () {
+  var Document = function () {
    this.styleSheets = [];
    this.childNodes = [];
+    this.children = [];
  };

  Document.prototype = {
@ -382,11 +499,11 @@

    getElementById: function (id) {
      function getElem(node) {
-        let length = node.childNodes.length;
+        var length = node.children.length;
        if (node.id === id)
          return node;
-        for (let i = 0; i < length; i++) {
-          let el = getElem(node.childNodes[i]);
+        for (var i = 0; i < length; i++) {
+          var el = getElem(node.children[i]);
          if (el)
            return el;
        }
@ -396,14 +513,16 @@
    },

    createElement: function (tag) {
-      let node = new Element(tag);
+      var node = new Element(tag);
      return node;
    }
  };

-  let Element = function (tag) {
+  var Element = function (tag) {
    this.attributes = [];
    this.childNodes = [];
+    this.children = [];
+    this.nextElementSibling = this.previousElementSibling = null;
    this.localName = tag.toLowerCase();
    this.tagName = tag.toUpperCase();
    this.style = new Style(this);
@ -454,16 +573,16 @@

    get innerHTML() {
      function getHTML(node) {
-        let i = 0;
+        var i = 0;
        for (i = 0; i < node.childNodes.length; i++) {
-          let child = node.childNodes[i];
+          var child = node.childNodes[i];
          if (child.localName) {
            arr.push("<" + child.localName);

            // serialize attribute list
-            for (let j = 0; j < child.attributes.length; j++) {
-              let attr = child.attributes[j];
-              let quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
+            for (var j = 0; j < child.attributes.length; j++) {
+              var attr = child.attributes[j];
+              var quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
              arr.push(" " + attr.name + '=' + quote + attr.value + quote);
            }

@ -484,30 +603,30 @@

      // Using Array.join() avoids the overhead from lazy string concatenation.
      // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
-      let arr = [];
+      var arr = [];
      getHTML(this);
      return arr.join("");
    },

    set innerHTML(html) {
-      let parser = new JSDOMParser();
-      let node = parser.parse(html);
-      for (let i = this.childNodes.length; --i >= 0;) {
+      var parser = new JSDOMParser();
+      var node = parser.parse(html);
+      for (var i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = null;
      }
      this.childNodes = node.childNodes;
-      for (let i = this.childNodes.length; --i >= 0;) {
+      for (var i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = this;
      }
    },

    set textContent(text) {
      // clear parentNodes for existing children
-      for (let i = this.childNodes.length; --i >= 0;) {
+      for (var i = this.childNodes.length; --i >= 0;) {
        this.childNodes[i].parentNode = null;
      }

-      let node = new Text();
+      var node = new Text();
      this.childNodes = [ node ];
      node.textContent = text;
      node.parentNode = this;
@ -515,9 +634,9 @@

    get textContent() {
      function getText(node) {
-        let nodes = node.childNodes;
-        for (let i = 0; i < nodes.length; i++) {
-          let child = nodes[i];
+        var nodes = node.childNodes;
+        for (var i = 0; i < nodes.length; i++) {
+          var child = nodes[i];
          if (child.nodeType === 3) {
            text.push(child.textContent);
          } else {
@ -528,14 +647,14 @@

      // Using Array.join() avoids the overhead from lazy string concatenation.
      // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
-      let text = [];
+      var text = [];
      getText(this);
      return text.join("");
    },

    getAttribute: function (name) {
-      for (let i = this.attributes.length; --i >= 0;) {
-        let attr = this.attributes[i];
+      for (var i = this.attributes.length; --i >= 0;) {
+        var attr = this.attributes[i];
        if (attr.name === name)
          return attr.value;
      }
@ -543,8 +662,8 @@
    },

    setAttribute: function (name, value) {
-      for (let i = this.attributes.length; --i >= 0;) {
-        let attr = this.attributes[i];
+      for (var i = this.attributes.length; --i >= 0;) {
+        var attr = this.attributes[i];
        if (attr.name === name) {
          attr.value = value;
          return;
@ -554,8 +673,8 @@
    },

    removeAttribute: function (name) {
-      for (let i = this.attributes.length; --i >= 0;) {
-        let attr = this.attributes[i];
+      for (var i = this.attributes.length; --i >= 0;) {
+        var attr = this.attributes[i];
        if (attr.name === name) {
          this.attributes.splice(i, 1);
          break;
@ -564,7 +683,7 @@
    }
  };

-  let Style = function (node) {
+  var Style = function (node) {
    this.node = node;
  };

@ -575,14 +694,14 @@
  // manipulations, so this should be okay.
  Style.prototype = {
    getStyle: function (styleName) {
-      let attr = this.node.getAttribute("style");
+      var attr = this.node.getAttribute("style");
      if (!attr)
        return undefined;

-      let styles = attr.split(";");
-      for (let i = 0; i < styles.length; i++) {
-        let style = styles[i].split(":");
-        let name = style[0].trim();
+      var styles = attr.split(";");
+      for (var i = 0; i < styles.length; i++) {
+        var style = styles[i].split(":");
+        var name = style[0].trim();
        if (name === styleName)
          return style[1].trim();
      }
@ -591,12 +710,12 @@
    },

    setStyle: function (styleName, styleValue) {
-      let value = this.node.getAttribute("style") || "";
-      let index = 0;
+      var value = this.node.getAttribute("style") || "";
+      var index = 0;
      do {
-        let next = value.indexOf(";", index) + 1;
-        let length = next - index - 1;
-        let style = (length > 0 ? value.substr(index, length) : value.substr(index));
+        var next = value.indexOf(";", index) + 1;
+        var length = next - index - 1;
+        var style = (length > 0 ? value.substr(index, length) : value.substr(index));
        if (style.substr(0, style.indexOf(":")).trim() === styleName) {
          value = value.substr(0, index).trim() + (next ? " " + value.substr(next).trim() : "");
          break;
@ -611,7 +730,7 @@

  // For each item in styleMap, define a getter and setter on the style
  // property.
-  for (let jsName in styleMap) {
+  for (var jsName in styleMap) {
    (function (cssName) {
      Style.prototype.__defineGetter__(jsName, function () {
        return this.getStyle(cssName);
@ -622,7 +741,7 @@
    }) (styleMap[jsName]);
  }

-  let JSDOMParser = function () {
+  var JSDOMParser = function () {
    this.currentChar = 0;

    // In makeElementNode() we build up many strings one char at a time. Using
@ -659,8 +778,8 @@
     * character and returns the text string in between.
     */
    readString: function (quote) {
-      let str;
-      let n = this.html.indexOf(quote, this.currentChar);
+      var str;
+      var n = this.html.indexOf(quote, this.currentChar);
      if (n === -1) {
        this.currentChar = this.html.length;
        str = null;
@ -677,9 +796,9 @@
     * pair and adds the result to the attributes list.
     */
    readAttribute: function (node) {
-      let name = "";
+      var name = "";

-      let n = this.html.indexOf("=", this.currentChar);
+      var n = this.html.indexOf("=", this.currentChar);
      if (n === -1) {
        this.currentChar = this.html.length;
      } else {
@ -692,14 +811,14 @@
        return;

      // After a '=', we should see a '"' for the attribute value
-      let c = this.nextChar();
+      var c = this.nextChar();
      if (c !== '"' && c !== "'") {
-        error("expecting '\"'");
+        error("Error reading attribute " + name + ", expecting '\"'");
        return;
      }

      // Read the attribute value (and consume the matching quote)
-      let value = this.readString(c);
+      var value = this.readString(c);

      if (!value)
        return;
@ -718,29 +837,30 @@
     *          Element
     */
    makeElementNode: function (retPair) {
-      let c = this.nextChar();
+      var c = this.nextChar();

      // Read the Element tag name
-      let strBuf = this.strBuf;
+      var strBuf = this.strBuf;
      strBuf.length = 0;
-      while (c !== " " && c !== ">" && c !== "/") {
+      while (whitespace.indexOf(c) == -1 && c !== ">" && c !== "/") {
        if (c === undefined)
          return false;
        strBuf.push(c);
        c = this.nextChar();
      }
-      let tag = strBuf.join('');
+      var tag = strBuf.join('');

      if (!tag)
        return false;

-      let node = new Element(tag);
+      var node = new Element(tag);

      // Read Element attributes
      while (c !== "/" && c !== ">") {
        if (c === undefined)
          return false;
-        while (this.match(" "));
+        while (whitespace.indexOf(this.html[this.currentChar++]) != -1);
+        this.currentChar--;
        c = this.nextChar();
        if (c !== "/" && c !== ">") {
          --this.currentChar;
@ -749,12 +869,12 @@
      }

      // If this is a self-closing tag, read '/>'
-      let closed = tag in voidElems;
+      var closed = tag in voidElems;
      if (c === "/") {
        closed = true;
        c = this.nextChar();
        if (c !== ">") {
-          error("expected '>'");
+          error("expected '>' to close " + tag);
          return false;
        }
      }
@ -771,7 +891,7 @@
     * @returns whether input matched string
     */
    match: function (str) {
-      let strlen = str.length;
+      var strlen = str.length;
      if (this.html.substr(this.currentChar, strlen) === str) {
        this.currentChar += strlen;
        return true;
@ -784,7 +904,7 @@
     * and including the matched string.
     */
    discardTo: function (str) {
-      let index = this.html.indexOf(str, this.currentChar) + str.length;
+      var index = this.html.indexOf(str, this.currentChar) + str.length;
      if (index === -1)
        this.currentChar = this.html.length;
      this.currentChar = index;
@ -794,16 +914,27 @@
     * Reads child nodes for the given node.
     */
    readChildren: function (node) {
-      let child;
+      var child;
      while ((child = this.readNode())) {
        // Don't keep Comment nodes
        if (child.nodeType !== 8) {
-          node.childNodes.push(child);
-          child.parentNode = node;
+          node.appendChild(child);
        }
      }
    },

+    /**
+     * Reads the raw contents of a <script> element. Everything from the
+     * current position up to (but not including) the next "</script>", or
+     * up to the end of the input if there is none, is appended to the node
+     * as a single Text child without being parsed as markup.
+     *
+     * @param node the script Element whose contents are being read
+     */
+    readScript: function (node) {
+      var end = this.html.indexOf("</script>", this.currentChar);
+      if (end === -1) {
+        // Unterminated script: consume the rest of the input.
+        end = this.html.length;
+      }
+      var txt = new Text();
+      txt.textContent = this.html.substring(this.currentChar, end);
+      node.appendChild(txt);
+      // Leave the cursor on the closing tag so the caller can match it.
+      this.currentChar = end;
+    },
+
+
    /**
     * Reads the next child node from the input. If we're reading a closing
     * tag, or if we've reached the end of input, return null.
@ -811,7 +942,7 @@
     * @returns the node
     */
    readNode: function () {
-      let c = this.nextChar();
+      var c = this.nextChar();
 
      if (c === undefined)
        return null;
@ -819,8 +950,8 @@
      // Read any text as Text node
      if (c !== "<") {
        --this.currentChar;
-        let node = new Text();
-        let n = this.html.indexOf("<", this.currentChar);
+        var node = new Text();
+        var n = this.html.indexOf("<", this.currentChar);
        if (n === -1) {
          node.textContent = this.html.substring(this.currentChar, this.html.length);
          this.currentChar = this.html.length;
@ -842,7 +973,7 @@
        if (this.match("--")) {
          this.discardTo("-->");
        } else {
-          let c = this.nextChar();
+          var c = this.nextChar();
          while (c !== ">") {
            if (c === undefined)
              return null;
@ -862,25 +993,32 @@
      }

      // Otherwise, we're looking at an Element node
-      let result = this.makeElementNode(this.retPair);
+      var result = this.makeElementNode(this.retPair);
      if (!result)
        return null;

-      let node = this.retPair[0];
-      let closed = this.retPair[1];
-      let localName = node.localName;
+      var node = this.retPair[0];
+      var closed = this.retPair[1];
+      var localName = node.localName;

      // If this isn't a void Element, read its child nodes
      if (!closed) {
+        if (localName == "script") {
+          this.readScript(node);
+        } else {
          this.readChildren(node);
-        let closingTag = "</" + localName + ">";
+        }
+        var closingTag = "</" + localName + ">";
        if (!this.match(closingTag)) {
          error("expected '" + closingTag + "'");
          return null;
        }
      }

-      if (localName === "title") {
+      // Only use the first title, because SVG might have other
+      // title elements which we don't care about (medium.com
+      // does this, at least).
+      if (localName === "title" && !this.doc.title) {
        this.doc.title = node.textContent.trim();
      } else if (localName === "head") {
        this.doc.head = node;
@ -898,14 +1036,14 @@
     */
    parse: function (html) {
      this.html = html;
-      let doc = this.doc = new Document();
+      var doc = this.doc = new Document();
      this.readChildren(doc);

      // If this is an HTML document, remove root-level children except for the
      // <html> node
      if (doc.documentElement) {
-        for (let i = doc.childNodes.length; --i >= 0;) {
-          let child = doc.childNodes[i];
+        for (var i = doc.childNodes.length; --i >= 0;) {
+          var child = doc.childNodes[i];
          if (child !== doc.documentElement) {
            doc.removeChild(child);
          }
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@ -102,16 +102,18 @@ Readability.prototype = {
    extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
    byline: /byline|author|dateline|writtenby/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
-    trim: /^\s+|\s+$/g,
    normalize: /\s{2,}/g,
-    videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
+    videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
-    whitespace: /^\s*$/
+    whitespace: /^\s*$/,
+    hasContent: /\S$/,
  },

  DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],

+  ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
+
  /**
   * Run any post-process modifications to article content as necessary.
   *
@ -204,7 +206,7 @@ Readability.prototype = {
        curTitle = this._getInnerText(hOnes[0]);
    }

-    curTitle = curTitle.replace(this.REGEXPS.trim, "");
+    curTitle = curTitle.trim();

    if (curTitle.split(' ').length <= 4)
      curTitle = origTitle;
@ -223,8 +225,8 @@ Readability.prototype = {

    // Remove all style tags in head
    var styleTags = doc.getElementsByTagName("style");
-    for (var st = 0; st < styleTags.length; st += 1) {
-      styleTags[st].textContent = "";
+    for (var st = styleTags.length - 1; st >= 0; st -= 1) {
+      styleTags[st].parentNode.removeChild(styleTags[st]);
    }

    if (doc.body) {
@ -305,6 +307,8 @@ Readability.prototype = {
  },

  _setNodeTag: function (node, tag) {
+    // FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
+    // won't actually be set).
    node.localName = tag.toLowerCase();
    node.tagName = tag.toUpperCase();
  },
@ -407,6 +411,54 @@ Readability.prototype = {
    node.readability.contentScore += this._getClassWeight(node);
  },

+  /**
+   * Detaches node from its parent and returns the element a depth-first
+   * traversal should visit next, skipping the removed node's descendants.
+   */
+  _removeAndGetNext: function(node) {
+    // Look up the successor first, while node is still linked into the tree.
+    var successor = this._getNextNode(node, true);
+    var parent = node.parentNode;
+    parent.removeChild(node);
+    return successor;
+  },
+
+  /**
+   * Returns the element that follows node in a depth-first walk over
+   * elements only, or a falsy value once the walk has run out of nodes.
+   *
+   * When ignoreSelfAndKids is true the node's own children are skipped,
+   * which is what a caller wants when the node (and its kids) are going
+   * away and the next node over is needed.
+   *
+   * Calling this in a loop will traverse the DOM depth-first.
+   */
+  _getNextNode: function(node, ignoreSelfAndKids) {
+    // Descend first, unless the caller asked us to skip this subtree.
+    var firstKid = ignoreSelfAndKids ? null : node.firstElementChild;
+    if (firstKid) {
+      return firstKid;
+    }
+
+    // Otherwise take the nearest following element sibling, looking at the
+    // node itself and then at each ancestor in turn. The ancestors were
+    // already visited on the way down, so only their siblings matter.
+    var current = node;
+    var sibling = current.nextElementSibling;
+    while (!sibling) {
+      current = current.parentNode;
+      if (!current) {
+        // Ran off the top of the tree: nothing left to visit.
+        return current;
+      }
+      sibling = current.nextElementSibling;
+    }
+    return sibling;
+  },
+
+  /**
+   * Decides whether node is the article's byline and, if so, records its
+   * trimmed text in this._articleByline. Once a byline has been recorded,
+   * every later call returns false without inspecting the node.
+   *
+   * A node qualifies when its rel attribute is "author" or matchString
+   * matches the byline regexp, and its text passes _isValidByline.
+   *
+   * Returns true if the node was recorded as the byline, false otherwise.
+   */
+  _checkByline: function(node, matchString) {
+    if (this._articleByline) {
+      return false;
+    }
+
+    // rel stays undefined for nodes that have no getAttribute method.
+    var rel;
+    if (node.getAttribute !== undefined) {
+      rel = node.getAttribute("rel");
+    }
+
+    var looksLikeByline = rel === "author" || this.REGEXPS.byline.test(matchString);
+    if (!looksLikeByline || !this._isValidByline(node.textContent)) {
+      return false;
+    }
+
+    this._articleByline = node.textContent.trim();
+    return true;
+  },
+
  /***
   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
   *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@ -430,65 +482,37 @@ Readability.prototype = {
    // Check if any "dir" is set on the toplevel document element
    this._articleDir = doc.documentElement.getAttribute("dir");

-    //helper function used below in the 'while' loop:
-    function purgeNode(node, allElements) {
-      for (var i = node.childNodes.length; --i >= 0;) {
-        purgeNode(node.childNodes[i], allElements);
-      }
-      if (node._index !== undefined && allElements[node._index] == node)
-        delete allElements[node._index];
-    }
    while (true) {
      var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
-      var allElements = page.getElementsByTagName('*');

      // First, node prepping. Trash nodes that look cruddy (like ones with the
      // class name "comment", etc), and turn divs into P tags where they have been
      // used inappropriately (as in, where they contain no other block level elements.)
-      //
-      // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
-      // TODO: Shouldn't this be a reverse traversal?
-      var node = null;
-      var nodesToScore = [];
+      var elementsToScore = [];
+      var node = this._doc.documentElement;

-      // var each node know its index in the allElements array.
-      for (var i = allElements.length; --i >= 0;) {
-        allElements[i]._index = i;
-      }
+      while (node) {
+        var matchString = node.className + " " + node.id;

-      /**
-       * JSDOMParser returns static node lists, not live ones. When we remove
-       * an element from the document, we need to manually remove it - and all
-       * of its children - from the allElements array.
-       */
-      for (var nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) {
-        if (!(node = allElements[nodeIndex]))
+        // Check to see if this node is a byline, and remove it if it is.
+        if (this._checkByline(node, matchString)) {
+          node = this._removeAndGetNext(node);
          continue;
-
-        var matchString = node.className + node.id;
-        if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) {
-          if (this._isValidByline(node.textContent)) {
-            this._articleByline = node.textContent.trim();
-            node.parentNode.removeChild(node);
-            purgeNode(node, allElements);
-            continue;
-          }
        }

        // Remove unlikely candidates
        if (stripUnlikelyCandidates) {
-          if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 &&
-            matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 &&
+          if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
+              !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
              node.tagName !== "BODY") {
            this.log("Removing unlikely candidate - " + matchString);
-            node.parentNode.removeChild(node);
-            purgeNode(node, allElements);
+            node = this._removeAndGetNext(node);
            continue;
          }
        }

        if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
-          nodesToScore[nodesToScore.length] = node;
+          elementsToScore.push(node);

        // Turn all divs that don't have children block level elements into p's
        if (node.tagName === "DIV") {
@ -496,34 +520,28 @@ Readability.prototype = {
          // element. DIVs with only a P element inside and no text content can be
          // safely converted into plain P elements to avoid confusing the scoring
          // algorithm with DIVs with are, in practice, paragraphs.
-          var pIndex = this._getSinglePIndexInsideDiv(node);
-
-          if (pIndex >= 0 || !this._hasChildBlockElement(node)) {
-            if (pIndex >= 0) {
-              var newNode = node.childNodes[pIndex];
+          if (this._hasSinglePInsideElement(node)) {
+            var newNode = node.firstElementChild;
            node.parentNode.replaceChild(newNode, node);
-              purgeNode(node, allElements);
-            } else {
+            node = newNode;
+          } else if (!this._hasChildBlockElement(node)) {
            this._setNodeTag(node, "P");
-              nodesToScore[nodesToScore.length] = node;
-            }
+            elementsToScore.push(node);
          } else {
            // EXPERIMENTAL
            for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
              var childNode = node.childNodes[i];
-              if (!childNode)
-                continue;
-
-              if (childNode.nodeType === 3) { // Node.TEXT_NODE
+              if (childNode.nodeType === Node.TEXT_NODE) {
                var p = doc.createElement('p');
                p.textContent = childNode.textContent;
                p.style.display = 'inline';
                p.className = 'readability-styled';
-                childNode.parentNode.replaceChild(p, childNode);
+                node.replaceChild(p, childNode);
              }
            }
          }
        }
+        node = this._getNextNode(node);
      }

      /**
@ -533,10 +551,10 @@ Readability.prototype = {
       * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
      **/
      var candidates = [];
-      for (var pt = 0; pt < nodesToScore.length; pt += 1) {
-        var parentNode = nodesToScore[pt].parentNode;
+      for (var pt = 0; pt < elementsToScore.length; pt += 1) {
+        var parentNode = elementsToScore[pt].parentNode;
        var grandParentNode = parentNode ? parentNode.parentNode : null;
-        var innerText = this._getInnerText(nodesToScore[pt]);
+        var innerText = this._getInnerText(elementsToScore[pt]);

        if (!parentNode || typeof(parentNode.tagName) === 'undefined')
          continue;
@ -612,15 +630,40 @@ Readability.prototype = {
        // Move all of the page's children into topCandidate
        topCandidate = doc.createElement("DIV");
        neededToCreateTopCandidate = true;
-        var children = page.childNodes;
-        while (children.length) {
-          this.log("Moving child out:", children[0]);
-          topCandidate.appendChild(children[0]);
+        // Move everything (not just elements, also text nodes etc.) into the container
+        // so we even include text directly in the body:
+        var kids = page.childNodes;
+        while (kids.length) {
+          this.log("Moving child out:", kids[0]);
+          topCandidate.appendChild(kids[0]);
        }

        page.appendChild(topCandidate);

        this._initializeNode(topCandidate);
+      } else if (topCandidate) {
+        // Because of our bonus system, parents of candidates might have scores
+        // themselves. They get half of the node. There won't be nodes with higher
+        // scores than our topCandidate, but if we see the score going *up* in the first
+        // few steps up the tree, that's a decent sign that there might be more content
+        // lurking in other places that we want to unify in. The sibling stuff
+        // below does some of that - but only if we've looked high enough up the DOM
+        // tree.
+        var parentOfTopCandidate = topCandidate.parentNode;
+        var scoreThreshold = topCandidate.readability.contentScore / 3; // The scores shouldn't get too low.
+        // The parent may carry no `readability` data (the loop below checks for that), so guard this read too.
+        var lastScore = (parentOfTopCandidate && parentOfTopCandidate.readability) ? parentOfTopCandidate.readability.contentScore : 0;
+        while (parentOfTopCandidate && parentOfTopCandidate.readability) {
+          var parentScore = parentOfTopCandidate.readability.contentScore;
+          if (parentScore < scoreThreshold)
+            break;
+          if (parentScore > lastScore) {
+            // Alright! We found a better parent to use.
+            topCandidate = parentOfTopCandidate;
+            break;
+          }
+          parentOfTopCandidate = parentOfTopCandidate.parentNode;
+        }
      }

      // Now that we have the top candidate, look through its siblings for content
@ -631,31 +674,30 @@ Readability.prototype = {
        articleContent.id = "readability-content";

      var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
-      var siblingNodes = topCandidate.parentNode.childNodes;
+      var siblings = topCandidate.parentNode.children;

-      for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) {
-        var siblingNode = siblingNodes[s];
+      for (var s = 0, sl = siblings.length; s < sl; s++) {
+        var sibling = siblings[s];
        var append = false;

-        this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : ''));
-        this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
+        this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
+        this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');

-        if (siblingNode === topCandidate)
+        if (sibling === topCandidate) {
          append = true;
-
+        } else {
          var contentBonus = 0;

          // Give a bonus if sibling nodes and top candidates have the example same classname
-        if (siblingNode.className === topCandidate.className && topCandidate.className !== "")
+          if (sibling.className === topCandidate.className && topCandidate.className !== "")
            contentBonus += topCandidate.readability.contentScore * 0.2;

-        if (typeof siblingNode.readability !== 'undefined' &&
-          (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
+          if (sibling.readability &&
+              ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
            append = true;
-
-        if (siblingNode.nodeName === "P") {
-          var linkDensity = this._getLinkDensity(siblingNode);
-          var nodeContent = this._getInnerText(siblingNode);
+          } else if (sibling.nodeName === "P") {
+            var linkDensity = this._getLinkDensity(sibling);
+            var nodeContent = this._getInnerText(sibling);
            var nodeLength = nodeContent.length;

            if (nodeLength > 80 && linkDensity < 0.25) {
@ -664,38 +706,38 @@ Readability.prototype = {
              append = true;
            }
          }
+        }

        if (append) {
-          this.log("Appending node:", siblingNode);
+          this.log("Appending node:", sibling);

-          // siblingNodes is a reference to the childNodes array, and
-          // siblingNode is removed from the array when we call appendChild()
-          // below. As a result, we must revisit this index since the nodes
-          // have been shifted.
-          s -= 1;
-          sl -= 1;
-
-          if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
+          if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
            // We have a node that isn't a common block level element, like a form or td tag.
-            // Turn it into a div so it doesn't get filtered out later by accident. */
-            this.log("Altering siblingNode:", siblingNode, 'to div.');
+            // Turn it into a div so it doesn't get filtered out later by accident.
+            this.log("Altering sibling:", sibling, 'to div.');

-            this._setNodeTag(siblingNode, "DIV");
+            this._setNodeTag(sibling, "DIV");
          }

          // To ensure a node does not interfere with readability styles,
          // remove its classnames.
-          siblingNode.removeAttribute("class");
+          sibling.removeAttribute("class");

-          // Append sibling and subtract from our list because it removes
-          // the node when you append to another node.
-          articleContent.appendChild(siblingNode);
+          articleContent.appendChild(sibling);
+          // siblings is a reference to the children array, and
+          // sibling is removed from the array when we call appendChild().
+          // As a result, we must revisit this index since the nodes
+          // have been shifted.
+          s -= 1;
+          sl -= 1;
        }
      }

+      if (this.ENABLE_LOGGING)
        this.log("Article content pre-prep: " + articleContent.innerHTML);
      // So we have all of the content that we need. Now we clean it up for presentation.
      this._prepArticle(articleContent);
+      if (this.ENABLE_LOGGING)
        this.log("Article content post-prep: " + articleContent.innerHTML);

      if (this._curPageNum === 1) {
@ -718,6 +760,7 @@ Readability.prototype = {
        }
      }

+      if (this.ENABLE_LOGGING)
        this.log("Article content after paging: " + articleContent.innerHTML);

      // Now that we've gone through the full algorithm, check to see if
@ -760,19 +803,12 @@ Readability.prototype = {
  },

  /**
-   * Attempts to get the excerpt from these
-   * sources in the following order:
-   * - meta description tag
-   * - open-graph description
-   * - twitter cards description
-   * - article's first paragraph
-   * If no excerpt is found, an empty string will be
-   * returned.
+   * Attempts to get excerpt and byline metadata for the article.
   * 
-   * @param Element - root element of the processed version page
-   * @return String - excerpt of the article
-  **/
-  _getExcerpt: function(articleContent) {
+   * @return Object with optional "excerpt" and "byline" properties
+   */
+  _getArticleMetadata: function() {
+    var metadata = {};
    var values = {};
    var metaElements = this._doc.getElementsByTagName("meta");

@ -789,7 +825,12 @@ Readability.prototype = {
      var elementName = element.getAttribute("name");
      var elementProperty = element.getAttribute("property");

-      var name;
+      if (elementName === "author") {
+        metadata.byline = element.getAttribute("content");
+        continue;
+      }
+
+      var name = null;
      if (namePattern.test(elementName)) {
        name = elementName;
      } else if (propertyPattern.test(elementProperty)) {
@ -808,26 +849,16 @@ Readability.prototype = {
    }

    if ("description" in values) {
-      return values["description"];
-    }
-
-    if ("og:description" in values) {
+      metadata.excerpt = values["description"];
+    } else if ("og:description" in values) {
      // Use facebook open graph description.
-      return values["og:description"];
-    }
-
-    if ("twitter:description" in values) {
+      metadata.excerpt = values["og:description"];
+    } else if ("twitter:description" in values) {
      // Use twitter cards description.
-      return values["twitter:description"];
+      metadata.excerpt = values["twitter:description"];
    }

-    // No description meta tags, use the article's first paragraph.
-    var paragraphs = articleContent.getElementsByTagName("p");
-    if (paragraphs.length > 0) {
-      return paragraphs[0].textContent;
-    }
-
-    return "";
+    return metadata;
  },

  /**
@ -847,33 +878,28 @@ Readability.prototype = {
  },

  /**
-   * Get child index of the only P element inside a DIV with no
-   * text content. Returns -1 if the DIV node contains non-empty
-   * text nodes or if it contains other element nodes.
+   * Check whether this element contains only whitespace and a single P element.
+   * Returns false if it has no element child or more than one, if its single
+   * element child is not a P, or if any of its text nodes has real content.
   *
   * @param Element
  **/
-  _getSinglePIndexInsideDiv: function(e) {
+  _hasSinglePInsideElement: function(e) {
+    // There should be exactly 1 element child which is a P:
+    if (e.children.length !== 1 || e.firstElementChild.tagName !== "P") {
+      return false;
+    }
+    // And there should be no text nodes with real content
    var childNodes = e.childNodes;
-    var pIndex = -1;
-
    for (var i = childNodes.length; --i >= 0;) {
      var node = childNodes[i];
-
-      if (node.nodeType === Node.ELEMENT_NODE) {
-        if (node.tagName !== "P")
-          return -1;
-
-        if (pIndex >= 0)
-          return -1;
-
-        pIndex = i;
-      } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
-        return -1;
+      if (node.nodeType === Node.TEXT_NODE &&
+          this.REGEXPS.hasContent.test(node.textContent)) {
+        return false;
      }
    }

-    return pIndex;
+    return true;
  },

  /**
@ -882,12 +908,9 @@ Readability.prototype = {
   * @param Element
   */
  _hasChildBlockElement: function (e) {
-    var length = e.childNodes.length;
+    var length = e.children.length;
    for (var i = 0; i < length; i++) {
-      var child = e.childNodes[i];
-      if (child.nodeType != 1)
-        continue;
-
+      var child = e.children[i];
      if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
        return true;
    }
@ -902,7 +925,7 @@ Readability.prototype = {
   * @return string
  **/
  _getInnerText: function(e, normalizeSpaces) {
-    var textContent = e.textContent.replace(this.REGEXPS.trim, "");
+    var textContent = e.textContent.trim();
    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;

    if (normalizeSpaces) {
@ -933,10 +956,9 @@ Readability.prototype = {
  **/
  _cleanStyles: function(e) {
    e = e || this._doc;
-    var cur = e.firstChild;
-
    if (!e)
      return;
+    var cur = e.firstChild;

    // Remove any root styles, if we're able.
    if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
@ -944,7 +966,7 @@ Readability.prototype = {

    // Go until there are no more child nodes
    while (cur !== null) {
-      if (cur.nodeType === 1) {
+      if (cur.nodeType === cur.ELEMENT_NODE) {
        // Remove style attribute(s) :
        if (cur.className !== "readability-styled")
          cur.removeAttribute("style");
@ -1355,19 +1377,19 @@ Readability.prototype = {

    // Look for a special classname
    if (typeof(e.className) === 'string' && e.className !== '') {
-      if (e.className.search(this.REGEXPS.negative) !== -1)
+      if (this.REGEXPS.negative.test(e.className))
        weight -= 25;

-      if (e.className.search(this.REGEXPS.positive) !== -1)
+      if (this.REGEXPS.positive.test(e.className))
        weight += 25;
    }

    // Look for a special ID
    if (typeof(e.id) === 'string' && e.id !== '') {
-      if (e.id.search(this.REGEXPS.negative) !== -1)
+      if (this.REGEXPS.negative.test(e.id))
        weight -= 25;

-      if (e.id.search(this.REGEXPS.positive) !== -1)
+      if (this.REGEXPS.positive.test(e.id))
        weight += 25;
    }

@ -1395,11 +1417,11 @@ Readability.prototype = {
        }

        // First, check the elements attributes to see if any of them contain youtube or vimeo
-        if (attributeValues.search(this.REGEXPS.videos) !== -1)
+        if (this.REGEXPS.videos.test(attributeValues))
          continue;

        // Then check the elements inside this element for the same.
-        if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1)
+        if (this.REGEXPS.videos.test(targetList[y].innerHTML))
          continue;
      }

@ -1445,7 +1467,7 @@ Readability.prototype = {
        var embedCount = 0;
        var embeds = tagsList[i].getElementsByTagName("embed");
        for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
-          if (embeds[ei].src.search(this.REGEXPS.videos) === -1)
+          if (!this.REGEXPS.videos.test(embeds[ei].src))
            embedCount += 1;
        }

@ -1532,6 +1554,8 @@ Readability.prototype = {
    this._prepDocument();

    var articleTitle = this._getArticleTitle();
+    var metadata = this._getArticleMetadata();
+
    var articleContent = this._grabArticle();
    if (!articleContent)
      return null;
@ -1548,14 +1572,22 @@ Readability.prototype = {
    //   }).bind(this), 500);
    // }

-    var excerpt = this._getExcerpt(articleContent);
+    // If we haven't found an excerpt in the article's metadata, use the article's
+    // first paragraph as the excerpt. This is used for displaying a preview of
+    // the article's content.
+    if (!metadata.excerpt) {
+      var paragraphs = articleContent.getElementsByTagName("p");
+      if (paragraphs.length > 0) {
+        metadata.excerpt = paragraphs[0].textContent;
+      }
+    }

    return { uri: this._uri,
             title: articleTitle,
-             byline: this._articleByline,
+             byline: metadata.byline || this._articleByline,
             dir: this._articleDir,
             content: articleContent.innerHTML,
             length: articleContent.textContent.length,
-             excerpt: excerpt };
+             excerpt: metadata.excerpt };
  }
 };