No bug: update readability libs to the up-to-date github versions to include significant perf and quality improvements, rs=me

--HG--
extra : rebase_source : 464cf93b5110cc456454bab0b698bc10d32bea49
This commit is contained in:
Gijs Kruitbosch 2015-03-20 20:50:45 -07:00
Родитель e771ae69ec
Коммит 82c7c6de1e
2 изменённых файлов: 448 добавлений и 278 удалений

Просмотреть файл

@ -1,3 +1,10 @@
/*
* DO NOT MODIFY THIS FILE DIRECTLY!
*
* This is a shared library that is maintained in an external repo:
* https://github.com/mozilla/readability
*/
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
@ -31,7 +38,7 @@
}
// When a style is set in JS, map it to the corresponding CSS attribute
let styleMap = {
var styleMap = {
"alignmentBaseline": "alignment-baseline",
"background": "background",
"backgroundAttachment": "background-attachment",
@ -223,7 +230,7 @@
};
// Elements that can be self-closing
let voidElems = {
var voidElems = {
"area": true,
"base": true,
"br": true,
@ -239,8 +246,10 @@
"source": true,
};
var whitespace = [" ", "\t", "\n", "\r"];
// See http://www.w3schools.com/dom/dom_nodetype.asp
let nodeTypes = {
var nodeTypes = {
ELEMENT_NODE: 1,
ATTRIBUTE_NODE: 2,
TEXT_NODE: 3,
@ -257,14 +266,12 @@
function getElementsByTagName(tag) {
tag = tag.toUpperCase();
let elems = [];
let allTags = (tag === "*");
var elems = [];
var allTags = (tag === "*");
function getElems(node) {
let length = node.childNodes.length;
for (let i = 0; i < length; i++) {
let child = node.childNodes[i];
if (child.nodeType !== 1)
continue;
var length = node.children.length;
for (var i = 0; i < length; i++) {
var child = node.children[i];
if (allTags || (child.tagName === tag))
elems.push(child);
getElems(child);
@ -274,7 +281,7 @@
return elems;
}
let Node = function () {};
var Node = function () {};
Node.prototype = {
attributes: null,
@ -283,18 +290,23 @@
nodeName: null,
parentNode: null,
textContent: null,
nextSibling: null,
previousSibling: null,
get firstChild() {
return this.childNodes[0] || null;
},
get nextSibling() {
if (this.parentNode) {
let childNodes = this.parentNode.childNodes;
return childNodes[childNodes.indexOf(this) + 1] || null;
}
get firstElementChild() {
return this.children[0] || null;
},
return null;
get lastChild() {
return this.childNodes[this.childNodes.length - 1] || null;
},
get lastElementChild() {
return this.children[this.children.length - 1] || null;
},
appendChild: function (child) {
@ -302,48 +314,152 @@
child.parentNode.removeChild(child);
}
var last = this.lastChild;
if (last)
last.nextSibling = child;
child.previousSibling = last;
if (child.nodeType === Node.ELEMENT_NODE) {
child.previousElementSibling = this.children[this.children.length - 1] || null;
this.children.push(child);
child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
}
this.childNodes.push(child);
child.parentNode = this;
},
removeChild: function (child) {
let childNodes = this.childNodes;
let childIndex = childNodes.indexOf(child);
var childNodes = this.childNodes;
var childIndex = childNodes.indexOf(child);
if (childIndex === -1) {
throw "removeChild: node not found";
} else {
child.parentNode = null;
var prev = child.previousSibling;
var next = child.nextSibling;
if (prev)
prev.nextSibling = next;
if (next)
next.previousSibling = prev;
if (child.nodeType === Node.ELEMENT_NODE) {
prev = child.previousElementSibling;
next = child.nextElementSibling;
if (prev)
prev.nextElementSibling = next;
if (next)
next.previousElementSibling = prev;
this.children.splice(this.children.indexOf(child), 1);
}
child.previousSibling = child.nextSibling = null;
child.previousElementSibling = child.nextElementSibling = null;
return childNodes.splice(childIndex, 1)[0];
}
},
replaceChild: function (newNode, oldNode) {
let childNodes = this.childNodes;
let childIndex = childNodes.indexOf(oldNode);
var childNodes = this.childNodes;
var childIndex = childNodes.indexOf(oldNode);
if (childIndex === -1) {
throw "replaceChild: node not found";
} else {
// This will take care of updating the new node if it was somewhere else before:
if (newNode.parentNode)
newNode.parentNode.removeChild(newNode);
childNodes[childIndex] = newNode;
// update the new node's sibling properties, and its new siblings' sibling properties
newNode.nextSibling = oldNode.nextSibling;
newNode.previousSibling = oldNode.previousSibling;
if (newNode.nextSibling)
newNode.nextSibling.previousSibling = newNode;
if (newNode.previousSibling)
newNode.previousSibling.nextSibling = newNode;
newNode.parentNode = this;
// Now deal with elements before we clear out those values for the old node,
// because it can help us take shortcuts here:
if (newNode.nodeType === Node.ELEMENT_NODE) {
if (oldNode.nodeType === Node.ELEMENT_NODE) {
// Both were elements, which makes this easier, we just swap things out:
newNode.previousElementSibling = oldNode.previousElementSibling;
newNode.nextElementSibling = oldNode.nextElementSibling;
if (newNode.previousElementSibling)
newNode.previousElementSibling.nextElementSibling = newNode;
if (newNode.nextElementSibling)
newNode.nextElementSibling.previousElementSibling = newNode;
this.children[this.children.indexOf(oldNode)] = newNode;
} else {
// Hard way:
newNode.previousElementSibling = (function() {
for (var i = childIndex - 1; i >= 0; i--) {
if (childNodes[i].nodeType === Node.ELEMENT_NODE)
return childNodes[i];
}
return null;
})();
if (newNode.previousElementSibling) {
newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
} else {
newNode.nextElementSibling = (function() {
for (var i = childIndex + 1; i < childNodes.length; i++) {
if (childNodes[i].nodeType === Node.ELEMENT_NODE)
return childNodes[i];
}
return null;
})();
}
if (newNode.previousElementSibling)
newNode.previousElementSibling.nextElementSibling = newNode;
if (newNode.nextElementSibling)
newNode.nextElementSibling.previousElementSibling = newNode;
if (newNode.nextElementSibling)
this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
else
this.children.push(newNode);
}
} else {
// new node is not an element node.
// if the old one was, update its element siblings:
if (oldNode.nodeType === Node.ELEMENT_NODE) {
if (oldNode.previousElementSibling)
oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
if (oldNode.nextElementSibling)
oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
this.children.splice(this.children.indexOf(oldNode), 1);
}
// If the old node wasn't an element, neither the new nor the old node was an element,
// and the children array and its members shouldn't need any updating.
}
oldNode.parentNode = null;
oldNode.previousSibling = null;
oldNode.nextSibling = null;
if (oldNode.nodeType === Node.ELEMENT_NODE) {
oldNode.previousElementSibling = null;
oldNode.nextElementSibling = null;
}
return oldNode;
}
}
};
for (let i in nodeTypes) {
for (var i in nodeTypes) {
Node[i] = Node.prototype[i] = nodeTypes[i];
}
let Attribute = function (name, value) {
var Attribute = function (name, value) {
this.name = name;
this.value = value;
};
let Comment = function () {
var Comment = function () {
this.childNodes = [];
};
@ -354,7 +470,7 @@
nodeType: Node.COMMENT_NODE
};
let Text = function () {
var Text = function () {
this.childNodes = [];
};
@ -366,9 +482,10 @@
textContent: ""
}
let Document = function () {
var Document = function () {
this.styleSheets = [];
this.childNodes = [];
this.children = [];
};
Document.prototype = {
@ -382,11 +499,11 @@
getElementById: function (id) {
function getElem(node) {
let length = node.childNodes.length;
var length = node.children.length;
if (node.id === id)
return node;
for (let i = 0; i < length; i++) {
let el = getElem(node.childNodes[i]);
for (var i = 0; i < length; i++) {
var el = getElem(node.children[i]);
if (el)
return el;
}
@ -396,14 +513,16 @@
},
createElement: function (tag) {
let node = new Element(tag);
var node = new Element(tag);
return node;
}
};
let Element = function (tag) {
var Element = function (tag) {
this.attributes = [];
this.childNodes = [];
this.children = [];
this.nextElementSibling = this.previousElementSibling = null;
this.localName = tag.toLowerCase();
this.tagName = tag.toUpperCase();
this.style = new Style(this);
@ -454,16 +573,16 @@
get innerHTML() {
function getHTML(node) {
let i = 0;
var i = 0;
for (i = 0; i < node.childNodes.length; i++) {
let child = node.childNodes[i];
var child = node.childNodes[i];
if (child.localName) {
arr.push("<" + child.localName);
// serialize attribute list
for (let j = 0; j < child.attributes.length; j++) {
let attr = child.attributes[j];
let quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
for (var j = 0; j < child.attributes.length; j++) {
var attr = child.attributes[j];
var quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
arr.push(" " + attr.name + '=' + quote + attr.value + quote);
}
@ -484,30 +603,30 @@
// Using Array.join() avoids the overhead from lazy string concatenation.
// See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
let arr = [];
var arr = [];
getHTML(this);
return arr.join("");
},
set innerHTML(html) {
let parser = new JSDOMParser();
let node = parser.parse(html);
for (let i = this.childNodes.length; --i >= 0;) {
var parser = new JSDOMParser();
var node = parser.parse(html);
for (var i = this.childNodes.length; --i >= 0;) {
this.childNodes[i].parentNode = null;
}
this.childNodes = node.childNodes;
for (let i = this.childNodes.length; --i >= 0;) {
for (var i = this.childNodes.length; --i >= 0;) {
this.childNodes[i].parentNode = this;
}
},
set textContent(text) {
// clear parentNodes for existing children
for (let i = this.childNodes.length; --i >= 0;) {
for (var i = this.childNodes.length; --i >= 0;) {
this.childNodes[i].parentNode = null;
}
let node = new Text();
var node = new Text();
this.childNodes = [ node ];
node.textContent = text;
node.parentNode = this;
@ -515,9 +634,9 @@
get textContent() {
function getText(node) {
let nodes = node.childNodes;
for (let i = 0; i < nodes.length; i++) {
let child = nodes[i];
var nodes = node.childNodes;
for (var i = 0; i < nodes.length; i++) {
var child = nodes[i];
if (child.nodeType === 3) {
text.push(child.textContent);
} else {
@ -528,14 +647,14 @@
// Using Array.join() avoids the overhead from lazy string concatenation.
// See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
let text = [];
var text = [];
getText(this);
return text.join("");
},
getAttribute: function (name) {
for (let i = this.attributes.length; --i >= 0;) {
let attr = this.attributes[i];
for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i];
if (attr.name === name)
return attr.value;
}
@ -543,8 +662,8 @@
},
setAttribute: function (name, value) {
for (let i = this.attributes.length; --i >= 0;) {
let attr = this.attributes[i];
for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i];
if (attr.name === name) {
attr.value = value;
return;
@ -554,8 +673,8 @@
},
removeAttribute: function (name) {
for (let i = this.attributes.length; --i >= 0;) {
let attr = this.attributes[i];
for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i];
if (attr.name === name) {
this.attributes.splice(i, 1);
break;
@ -564,7 +683,7 @@
}
};
let Style = function (node) {
var Style = function (node) {
this.node = node;
};
@ -575,14 +694,14 @@
// manipulations, so this should be okay.
Style.prototype = {
getStyle: function (styleName) {
let attr = this.node.getAttribute("style");
var attr = this.node.getAttribute("style");
if (!attr)
return undefined;
let styles = attr.split(";");
for (let i = 0; i < styles.length; i++) {
let style = styles[i].split(":");
let name = style[0].trim();
var styles = attr.split(";");
for (var i = 0; i < styles.length; i++) {
var style = styles[i].split(":");
var name = style[0].trim();
if (name === styleName)
return style[1].trim();
}
@ -591,12 +710,12 @@
},
setStyle: function (styleName, styleValue) {
let value = this.node.getAttribute("style") || "";
let index = 0;
var value = this.node.getAttribute("style") || "";
var index = 0;
do {
let next = value.indexOf(";", index) + 1;
let length = next - index - 1;
let style = (length > 0 ? value.substr(index, length) : value.substr(index));
var next = value.indexOf(";", index) + 1;
var length = next - index - 1;
var style = (length > 0 ? value.substr(index, length) : value.substr(index));
if (style.substr(0, style.indexOf(":")).trim() === styleName) {
value = value.substr(0, index).trim() + (next ? " " + value.substr(next).trim() : "");
break;
@ -611,7 +730,7 @@
// For each item in styleMap, define a getter and setter on the style
// property.
for (let jsName in styleMap) {
for (var jsName in styleMap) {
(function (cssName) {
Style.prototype.__defineGetter__(jsName, function () {
return this.getStyle(cssName);
@ -622,7 +741,7 @@
}) (styleMap[jsName]);
}
let JSDOMParser = function () {
var JSDOMParser = function () {
this.currentChar = 0;
// In makeElementNode() we build up many strings one char at a time. Using
@ -659,8 +778,8 @@
* character and returns the text string in between.
*/
readString: function (quote) {
let str;
let n = this.html.indexOf(quote, this.currentChar);
var str;
var n = this.html.indexOf(quote, this.currentChar);
if (n === -1) {
this.currentChar = this.html.length;
str = null;
@ -677,9 +796,9 @@
* pair and adds the result to the attributes list.
*/
readAttribute: function (node) {
let name = "";
var name = "";
let n = this.html.indexOf("=", this.currentChar);
var n = this.html.indexOf("=", this.currentChar);
if (n === -1) {
this.currentChar = this.html.length;
} else {
@ -692,14 +811,14 @@
return;
// After a '=', we should see a '"' for the attribute value
let c = this.nextChar();
var c = this.nextChar();
if (c !== '"' && c !== "'") {
error("expecting '\"'");
error("Error reading attribute " + name + ", expecting '\"'");
return;
}
// Read the attribute value (and consume the matching quote)
let value = this.readString(c);
var value = this.readString(c);
if (!value)
return;
@ -718,29 +837,30 @@
* Element
*/
makeElementNode: function (retPair) {
let c = this.nextChar();
var c = this.nextChar();
// Read the Element tag name
let strBuf = this.strBuf;
var strBuf = this.strBuf;
strBuf.length = 0;
while (c !== " " && c !== ">" && c !== "/") {
while (whitespace.indexOf(c) == -1 && c !== ">" && c !== "/") {
if (c === undefined)
return false;
strBuf.push(c);
c = this.nextChar();
}
let tag = strBuf.join('');
var tag = strBuf.join('');
if (!tag)
return false;
let node = new Element(tag);
var node = new Element(tag);
// Read Element attributes
while (c !== "/" && c !== ">") {
if (c === undefined)
return false;
while (this.match(" "));
while (whitespace.indexOf(this.html[this.currentChar++]) != -1);
this.currentChar--;
c = this.nextChar();
if (c !== "/" && c !== ">") {
--this.currentChar;
@ -749,12 +869,12 @@
}
// If this is a self-closing tag, read '/>'
let closed = tag in voidElems;
var closed = tag in voidElems;
if (c === "/") {
closed = true;
c = this.nextChar();
if (c !== ">") {
error("expected '>'");
error("expected '>' to close " + tag);
return false;
}
}
@ -771,7 +891,7 @@
* @returns whether input matched string
*/
match: function (str) {
let strlen = str.length;
var strlen = str.length;
if (this.html.substr(this.currentChar, strlen) === str) {
this.currentChar += strlen;
return true;
@ -784,7 +904,7 @@
* and including the matched string.
*/
discardTo: function (str) {
let index = this.html.indexOf(str, this.currentChar) + str.length;
var index = this.html.indexOf(str, this.currentChar) + str.length;
if (index === -1)
this.currentChar = this.html.length;
this.currentChar = index;
@ -794,16 +914,27 @@
* Reads child nodes for the given node.
*/
readChildren: function (node) {
let child;
var child;
while ((child = this.readNode())) {
// Don't keep Comment nodes
if (child.nodeType !== 8) {
node.childNodes.push(child);
child.parentNode = node;
node.appendChild(child);
}
}
},
readScript: function (node) {
var index = this.html.indexOf("</script>", this.currentChar);
if (index === -1) {
index = this.html.length;
}
var txt = new Text();
txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
node.appendChild(txt);
this.currentChar = index;
},
/**
* Reads the next child node from the input. If we're reading a closing
* tag, or if we've reached the end of input, return null.
@ -811,7 +942,7 @@
* @returns the node
*/
readNode: function () {
let c = this.nextChar();
var c = this.nextChar();
if (c === undefined)
return null;
@ -819,8 +950,8 @@
// Read any text as Text node
if (c !== "<") {
--this.currentChar;
let node = new Text();
let n = this.html.indexOf("<", this.currentChar);
var node = new Text();
var n = this.html.indexOf("<", this.currentChar);
if (n === -1) {
node.textContent = this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length;
@ -842,7 +973,7 @@
if (this.match("--")) {
this.discardTo("-->");
} else {
let c = this.nextChar();
var c = this.nextChar();
while (c !== ">") {
if (c === undefined)
return null;
@ -862,25 +993,32 @@
}
// Otherwise, we're looking at an Element node
let result = this.makeElementNode(this.retPair);
var result = this.makeElementNode(this.retPair);
if (!result)
return null;
let node = this.retPair[0];
let closed = this.retPair[1];
let localName = node.localName;
var node = this.retPair[0];
var closed = this.retPair[1];
var localName = node.localName;
// If this isn't a void Element, read its child nodes
if (!closed) {
if (localName == "script") {
this.readScript(node);
} else {
this.readChildren(node);
let closingTag = "</" + localName + ">";
}
var closingTag = "</" + localName + ">";
if (!this.match(closingTag)) {
error("expected '" + closingTag + "'");
return null;
}
}
if (localName === "title") {
// Only use the first title, because SVG might have other
// title elements which we don't care about (medium.com
// does this, at least).
if (localName === "title" && !this.doc.title) {
this.doc.title = node.textContent.trim();
} else if (localName === "head") {
this.doc.head = node;
@ -898,14 +1036,14 @@
*/
parse: function (html) {
this.html = html;
let doc = this.doc = new Document();
var doc = this.doc = new Document();
this.readChildren(doc);
// If this is an HTML document, remove root-level children except for the
// <html> node
if (doc.documentElement) {
for (let i = doc.childNodes.length; --i >= 0;) {
let child = doc.childNodes[i];
for (var i = doc.childNodes.length; --i >= 0;) {
var child = doc.childNodes[i];
if (child !== doc.documentElement) {
doc.removeChild(child);
}

Просмотреть файл

@ -102,16 +102,18 @@ Readability.prototype = {
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
trim: /^\s+|\s+$/g,
normalize: /\s{2,}/g,
videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/
whitespace: /^\s*$/,
hasContent: /\S$/,
},
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
/**
* Run any post-process modifications to article content as necessary.
*
@ -204,7 +206,7 @@ Readability.prototype = {
curTitle = this._getInnerText(hOnes[0]);
}
curTitle = curTitle.replace(this.REGEXPS.trim, "");
curTitle = curTitle.trim();
if (curTitle.split(' ').length <= 4)
curTitle = origTitle;
@ -223,8 +225,8 @@ Readability.prototype = {
// Remove all style tags in head
var styleTags = doc.getElementsByTagName("style");
for (var st = 0; st < styleTags.length; st += 1) {
styleTags[st].textContent = "";
for (var st = styleTags.length - 1; st >= 0; st -= 1) {
styleTags[st].parentNode.removeChild(styleTags[st]);
}
if (doc.body) {
@ -305,6 +307,8 @@ Readability.prototype = {
},
_setNodeTag: function (node, tag) {
// FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
// won't actually be set).
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
},
@ -407,6 +411,54 @@ Readability.prototype = {
node.readability.contentScore += this._getClassWeight(node);
},
_removeAndGetNext: function(node) {
var nextNode = this._getNextNode(node, true);
node.parentNode.removeChild(node);
return nextNode;
},
/**
* Traverse the DOM from node to node, starting at the node passed in.
* Pass true for the second parameter to indicate this node itself
* (and its kids) are going away, and we want the next node over.
*
* Calling this in a loop will traverse the DOM depth-first.
*/
_getNextNode: function(node, ignoreSelfAndKids) {
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.firstElementChild) {
return node.firstElementChild;
}
// Then for siblings...
if (node.nextElementSibling) {
return node.nextElementSibling;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
} while (node && !node.nextElementSibling);
return node && node.nextElementSibling;
},
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
}
if (node.getAttribute !== undefined) {
var rel = node.getAttribute("rel");
}
if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
this._articleByline = node.textContent.trim();
return true;
}
return false;
},
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@ -430,65 +482,37 @@ Readability.prototype = {
// Check if any "dir" is set on the toplevel document element
this._articleDir = doc.documentElement.getAttribute("dir");
//helper function used below in the 'while' loop:
function purgeNode(node, allElements) {
for (var i = node.childNodes.length; --i >= 0;) {
purgeNode(node.childNodes[i], allElements);
}
if (node._index !== undefined && allElements[node._index] == node)
delete allElements[node._index];
}
while (true) {
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
var allElements = page.getElementsByTagName('*');
// First, node prepping. Trash nodes that look cruddy (like ones with the
// class name "comment", etc), and turn divs into P tags where they have been
// used inappropriately (as in, where they contain no other block level elements.)
//
// Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
// TODO: Shouldn't this be a reverse traversal?
var node = null;
var nodesToScore = [];
var elementsToScore = [];
var node = this._doc.documentElement;
// Let each node know its index in the allElements array.
for (var i = allElements.length; --i >= 0;) {
allElements[i]._index = i;
}
while (node) {
var matchString = node.className + " " + node.id;
/**
* JSDOMParser returns static node lists, not live ones. When we remove
* an element from the document, we need to manually remove it - and all
* of its children - from the allElements array.
*/
for (var nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) {
if (!(node = allElements[nodeIndex]))
// Check to see if this node is a byline, and remove it if it is.
if (this._checkByline(node, matchString)) {
node = this._removeAndGetNext(node);
continue;
var matchString = node.className + node.id;
if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) {
if (this._isValidByline(node.textContent)) {
this._articleByline = node.textContent.trim();
node.parentNode.removeChild(node);
purgeNode(node, allElements);
continue;
}
}
// Remove unlikely candidates
if (stripUnlikelyCandidates) {
if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 &&
matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 &&
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
node.tagName !== "BODY") {
this.log("Removing unlikely candidate - " + matchString);
node.parentNode.removeChild(node);
purgeNode(node, allElements);
node = this._removeAndGetNext(node);
continue;
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
nodesToScore[nodesToScore.length] = node;
elementsToScore.push(node);
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
@ -496,34 +520,28 @@ Readability.prototype = {
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs which are, in practice, paragraphs.
var pIndex = this._getSinglePIndexInsideDiv(node);
if (pIndex >= 0 || !this._hasChildBlockElement(node)) {
if (pIndex >= 0) {
var newNode = node.childNodes[pIndex];
if (this._hasSinglePInsideElement(node)) {
var newNode = node.firstElementChild;
node.parentNode.replaceChild(newNode, node);
purgeNode(node, allElements);
} else {
node = newNode;
} else if (!this._hasChildBlockElement(node)) {
this._setNodeTag(node, "P");
nodesToScore[nodesToScore.length] = node;
}
elementsToScore.push(node);
} else {
// EXPERIMENTAL
for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
var childNode = node.childNodes[i];
if (!childNode)
continue;
if (childNode.nodeType === 3) { // Node.TEXT_NODE
if (childNode.nodeType === Node.TEXT_NODE) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
node.replaceChild(p, childNode);
}
}
}
}
node = this._getNextNode(node);
}
/**
@ -533,10 +551,10 @@ Readability.prototype = {
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
for (var pt = 0; pt < nodesToScore.length; pt += 1) {
var parentNode = nodesToScore[pt].parentNode;
for (var pt = 0; pt < elementsToScore.length; pt += 1) {
var parentNode = elementsToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = this._getInnerText(nodesToScore[pt]);
var innerText = this._getInnerText(elementsToScore[pt]);
if (!parentNode || typeof(parentNode.tagName) === 'undefined')
continue;
@ -612,15 +630,40 @@ Readability.prototype = {
// Move all of the page's children into topCandidate
topCandidate = doc.createElement("DIV");
neededToCreateTopCandidate = true;
var children = page.childNodes;
while (children.length) {
this.log("Moving child out:", children[0]);
topCandidate.appendChild(children[0]);
// Move everything (not just elements, also text nodes etc.) into the container
// so we even include text directly in the body:
var kids = page.childNodes;
while (kids.length) {
this.log("Moving child out:", kids[0]);
topCandidate.appendChild(kids[0]);
}
page.appendChild(topCandidate);
this._initializeNode(topCandidate);
} else if (topCandidate) {
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
// few steps up the tree, that's a decent sign that there might be more content
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
var parentOfTopCandidate = topCandidate.parentNode;
// The scores shouldn't get too low.
var scoreThreshold = topCandidate.readability.contentScore / 3;
var lastScore = parentOfTopCandidate.readability.contentScore;
while (parentOfTopCandidate && parentOfTopCandidate.readability) {
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold)
break;
if (parentScore > lastScore) {
// Alright! We found a better parent to use.
topCandidate = parentOfTopCandidate;
break;
}
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
}
// Now that we have the top candidate, look through its siblings for content
@ -631,31 +674,30 @@ Readability.prototype = {
articleContent.id = "readability-content";
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
var siblings = topCandidate.parentNode.children;
for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) {
var siblingNode = siblingNodes[s];
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
var append = false;
this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : ''));
this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
if (siblingNode === topCandidate)
if (sibling === topCandidate) {
append = true;
} else {
var contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the exact same classname
if (siblingNode.className === topCandidate.className && topCandidate.className !== "")
if (sibling.className === topCandidate.className && topCandidate.className !== "")
contentBonus += topCandidate.readability.contentScore * 0.2;
if (typeof siblingNode.readability !== 'undefined' &&
(siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
if (sibling.readability &&
((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
append = true;
if (siblingNode.nodeName === "P") {
var linkDensity = this._getLinkDensity(siblingNode);
var nodeContent = this._getInnerText(siblingNode);
} else if (sibling.nodeName === "P") {
var linkDensity = this._getLinkDensity(sibling);
var nodeContent = this._getInnerText(sibling);
var nodeLength = nodeContent.length;
if (nodeLength > 80 && linkDensity < 0.25) {
@ -664,38 +706,38 @@ Readability.prototype = {
append = true;
}
}
}
if (append) {
this.log("Appending node:", siblingNode);
this.log("Appending node:", sibling);
// siblingNodes is a reference to the childNodes array, and
// siblingNode is removed from the array when we call appendChild()
// below. As a result, we must revisit this index since the nodes
// have been shifted.
s -= 1;
sl -= 1;
if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident. */
this.log("Altering siblingNode:", siblingNode, 'to div.');
// Turn it into a div so it doesn't get filtered out later by accident.
this.log("Altering sibling:", sibling, 'to div.');
this._setNodeTag(siblingNode, "DIV");
this._setNodeTag(sibling, "DIV");
}
// To ensure a node does not interfere with readability styles,
// remove its classnames.
siblingNode.removeAttribute("class");
sibling.removeAttribute("class");
// Append sibling and subtract from our list because it removes
// the node when you append to another node.
articleContent.appendChild(siblingNode);
articleContent.appendChild(sibling);
// siblings is a reference to the children array, and
// sibling is removed from the array when we call appendChild().
// As a result, we must revisit this index since the nodes
// have been shifted.
s -= 1;
sl -= 1;
}
}
if (this.ENABLE_LOGGING)
this.log("Article content pre-prep: " + articleContent.innerHTML);
// So we have all of the content that we need. Now we clean it up for presentation.
this._prepArticle(articleContent);
if (this.ENABLE_LOGGING)
this.log("Article content post-prep: " + articleContent.innerHTML);
if (this._curPageNum === 1) {
@ -718,6 +760,7 @@ Readability.prototype = {
}
}
if (this.ENABLE_LOGGING)
this.log("Article content after paging: " + articleContent.innerHTML);
// Now that we've gone through the full algorithm, check to see if
@ -760,19 +803,12 @@ Readability.prototype = {
},
/**
* Attempts to get the excerpt from these
* sources in the following order:
* - meta description tag
* - open-graph description
* - twitter cards description
* - article's first paragraph
* If no excerpt is found, an empty string will be
* returned.
* Attempts to get excerpt and byline metadata for the article.
*
* @param Element - root element of the processed version page
* @return String - excerpt of the article
**/
_getExcerpt: function(articleContent) {
* @return Object with optional "excerpt" and "byline" properties
*/
_getArticleMetadata: function() {
var metadata = {};
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");
@ -789,7 +825,12 @@ Readability.prototype = {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
var name;
if (elementName === "author") {
metadata.byline = element.getAttribute("content");
continue;
}
var name = null;
if (namePattern.test(elementName)) {
name = elementName;
} else if (propertyPattern.test(elementProperty)) {
@ -808,26 +849,16 @@ Readability.prototype = {
}
if ("description" in values) {
return values["description"];
}
if ("og:description" in values) {
metadata.excerpt = values["description"];
} else if ("og:description" in values) {
// Use facebook open graph description.
return values["og:description"];
}
if ("twitter:description" in values) {
metadata.excerpt = values["og:description"];
} else if ("twitter:description" in values) {
// Use twitter cards description.
return values["twitter:description"];
metadata.excerpt = values["twitter:description"];
}
// No description meta tags, use the article's first paragraph.
var paragraphs = articleContent.getElementsByTagName("p");
if (paragraphs.length > 0) {
return paragraphs[0].textContent;
}
return "";
return metadata;
},
/**
@ -847,33 +878,28 @@ Readability.prototype = {
},
/**
* Get child index of the only P element inside a DIV with no
* text content. Returns -1 if the DIV node contains non-empty
* text nodes or if it contains other element nodes.
* Check if this node has only whitespace and a single P element.
* Returns false if the node contains non-empty text nodes,
* or if it does not have exactly one element child that is a P.
*
* @param Element
**/
_getSinglePIndexInsideDiv: function(e) {
_hasSinglePInsideElement: function(e) {
// There should be exactly 1 element child which is a P:
if (e.children.length != 1 || e.firstElementChild.tagName !== "P") {
return false;
}
// And there should be no text nodes with real content
var childNodes = e.childNodes;
var pIndex = -1;
for (var i = childNodes.length; --i >= 0;) {
var node = childNodes[i];
if (node.nodeType === Node.ELEMENT_NODE) {
if (node.tagName !== "P")
return -1;
if (pIndex >= 0)
return -1;
pIndex = i;
} else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
return -1;
if (node.nodeType == Node.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent)) {
return false;
}
}
return pIndex;
return true;
},
/**
@ -882,12 +908,9 @@ Readability.prototype = {
* @param Element
*/
_hasChildBlockElement: function (e) {
var length = e.childNodes.length;
var length = e.children.length;
for (var i = 0; i < length; i++) {
var child = e.childNodes[i];
if (child.nodeType != 1)
continue;
var child = e.children[i];
if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
return true;
}
@ -902,7 +925,7 @@ Readability.prototype = {
* @return string
**/
_getInnerText: function(e, normalizeSpaces) {
var textContent = e.textContent.replace(this.REGEXPS.trim, "");
var textContent = e.textContent.trim();
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
if (normalizeSpaces) {
@ -933,10 +956,9 @@ Readability.prototype = {
**/
_cleanStyles: function(e) {
e = e || this._doc;
var cur = e.firstChild;
if (!e)
return;
var cur = e.firstChild;
// Remove any root styles, if we're able.
if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
@ -944,7 +966,7 @@ Readability.prototype = {
// Go until there are no more child nodes
while (cur !== null) {
if (cur.nodeType === 1) {
if (cur.nodeType === cur.ELEMENT_NODE) {
// Remove style attribute(s) :
if (cur.className !== "readability-styled")
cur.removeAttribute("style");
@ -1355,19 +1377,19 @@ Readability.prototype = {
// Look for a special classname
if (typeof(e.className) === 'string' && e.className !== '') {
if (e.className.search(this.REGEXPS.negative) !== -1)
if (this.REGEXPS.negative.test(e.className))
weight -= 25;
if (e.className.search(this.REGEXPS.positive) !== -1)
if (this.REGEXPS.positive.test(e.className))
weight += 25;
}
// Look for a special ID
if (typeof(e.id) === 'string' && e.id !== '') {
if (e.id.search(this.REGEXPS.negative) !== -1)
if (this.REGEXPS.negative.test(e.id))
weight -= 25;
if (e.id.search(this.REGEXPS.positive) !== -1)
if (this.REGEXPS.positive.test(e.id))
weight += 25;
}
@ -1395,11 +1417,11 @@ Readability.prototype = {
}
// First, check the element's attributes to see if any of them contain youtube or vimeo
if (attributeValues.search(this.REGEXPS.videos) !== -1)
if (this.REGEXPS.videos.test(attributeValues))
continue;
// Then check the elements inside this element for the same.
if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1)
if (this.REGEXPS.videos.test(targetList[y].innerHTML))
continue;
}
@ -1445,7 +1467,7 @@ Readability.prototype = {
var embedCount = 0;
var embeds = tagsList[i].getElementsByTagName("embed");
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (embeds[ei].src.search(this.REGEXPS.videos) === -1)
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}
@ -1532,6 +1554,8 @@ Readability.prototype = {
this._prepDocument();
var articleTitle = this._getArticleTitle();
var metadata = this._getArticleMetadata();
var articleContent = this._grabArticle();
if (!articleContent)
return null;
@ -1548,14 +1572,22 @@ Readability.prototype = {
// }).bind(this), 500);
// }
var excerpt = this._getExcerpt(articleContent);
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.
if (!metadata.excerpt) {
var paragraphs = articleContent.getElementsByTagName("p");
if (paragraphs.length > 0) {
metadata.excerpt = paragraphs[0].textContent;
}
}
return { uri: this._uri,
title: articleTitle,
byline: this._articleByline,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
length: articleContent.textContent.length,
excerpt: excerpt };
excerpt: metadata.excerpt };
}
};