Bug 1158228 - merge github's readability code into m-c, rs=me

--HG--
extra : rebase_source : 78e7e90941e849680ef1fdf2f409ea321b2317bb
This commit is contained in:
Margaret Leibovic 2015-05-01 12:20:00 -07:00
Родитель 60c90655c5
Коммит 5c78694bbc
1 изменённых файлов: 76 добавлений и 49 удалений

Просмотреть файл

@ -108,6 +108,9 @@ Readability.prototype = {
// it quits and just show a link.
DEFAULT_MAX_PAGES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: ["SECTION", "P", "TD", "PRE"],
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
@ -119,7 +122,7 @@ Readability.prototype = {
byline: /byline|author|dateline|writtenby/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
videos: /https?:\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
@ -186,6 +189,15 @@ Readability.prototype = {
return Array.prototype.concat.apply([], nodeLists);
},
_getAllNodesWithTag: function(node, tagNames) {
if (node.querySelectorAll) {
return node.querySelectorAll(tagNames.join(','));
}
return [].concat.apply([], tagNames.map(function(tag) {
return node.getElementsByTagName(tag);
}));
},
/**
* Converts each <a> and <img> uri in the given element to an absolute URI.
*
@ -586,6 +598,18 @@ Readability.prototype = {
return false;
},
_getNodeAncestors: function(node, maxDepth) {
maxDepth = maxDepth || 0;
var i = 0, ancestors = [];
while (node.parentNode) {
ancestors.push(node.parentNode)
if (maxDepth && ++i === maxDepth)
break;
node = node.parentNode;
}
return ancestors;
},
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@ -640,8 +664,9 @@ Readability.prototype = {
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
}
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
@ -680,30 +705,18 @@ Readability.prototype = {
**/
var candidates = [];
this._forEachNode(elementsToScore, function(elementToScore) {
var parentNode = elementToScore.parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = this._getInnerText(elementToScore);
if (!parentNode || typeof(parentNode.tagName) === 'undefined')
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
return;
// If this paragraph is less than 25 characters, don't even count it.
var innerText = this._getInnerText(elementToScore);
if (innerText.length < 25)
return;
// Initialize readability data for the parent.
if (typeof parentNode.readability === 'undefined') {
this._initializeNode(parentNode);
candidates.push(parentNode);
}
// Initialize readability data for the grandparent.
if (grandParentNode &&
typeof(grandParentNode.readability) === 'undefined' &&
typeof(grandParentNode.tagName) !== 'undefined') {
this._initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
// Exclude nodes with no ancestor.
var ancestors = this._getNodeAncestors(elementToScore, 3);
if (ancestors.length === 0)
return;
var contentScore = 0;
@ -716,11 +729,18 @@ Readability.prototype = {
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Add the score to the parent. The grandparent gets half.
parentNode.readability.contentScore += contentScore;
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
if (!ancestor.tagName)
return;
if (grandParentNode)
grandParentNode.readability.contentScore += contentScore / 2;
if (typeof(ancestor.readability) === 'undefined') {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
ancestor.readability.contentScore += contentScore / (level === 0 ? 1 : level * 2);
});
});
// After we've calculated scores, loop through all of the possible
@ -848,10 +868,6 @@ Readability.prototype = {
sibling = this._setNodeTag(sibling, "DIV");
}
// To ensure a node does not interfere with readability styles,
// remove its classnames.
sibling.removeAttribute("class");
articleContent.appendChild(sibling);
// siblings is a reference to the children array, and
// sibling is removed from the array when we call appendChild().
@ -953,7 +969,7 @@ Readability.prototype = {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
if (elementName === "author") {
if ([elementName, elementProperty].indexOf("author") !== -1) {
metadata.byline = element.getAttribute("content");
return;
}
@ -1597,6 +1613,7 @@ Readability.prototype = {
var tagsList = e.getElementsByTagName(tag);
var curTagsLength = tagsList.length;
var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within.
// Traverse backwards so we can remove nodes at the same time
@ -1632,13 +1649,13 @@ Readability.prototype = {
var toRemove = false;
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
toRemove = true;
} else if (li > p && tag !== "ul" && tag !== "ol") {
} else if (!isList && li > p) {
toRemove = true;
} else if ( input > Math.floor(p/3) ) {
} else if (input > Math.floor(p/3)) {
toRemove = true;
} else if (contentLength < 25 && (img === 0 || img > 2) ) {
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
toRemove = true;
} else if (weight < 25 && linkDensity > 0.2) {
} else if (!isList && weight < 25 && linkDensity > 0.2) {
toRemove = true;
} else if (weight >= 25 && linkDensity > 0.5) {
toRemove = true;
@ -1663,7 +1680,7 @@ Readability.prototype = {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
var headers = e.getElementsByTagName('h' + headerIndex);
for (var i = headers.length - 1; i >= 0; i -= 1) {
if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33)
if (this._getClassWeight(headers[i]) < 0)
headers[i].parentNode.removeChild(headers[i]);
}
}
@ -1686,32 +1703,42 @@ Readability.prototype = {
*
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
*/
isProbablyReaderable: function() {
var nodes = this._doc.getElementsByTagName("p");
if (nodes.length < 5) {
return false;
}
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
var possibleParagraphs = 0;
for (var i = 0; i < nodes.length; i++) {
var node = nodes[i];
// FIXME we should have a fallback for helperIsVisible, but this is
// problematic because of jsdom's elem.style handling - see
// https://github.com/mozilla/readability/pull/186 for context.
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return this._someNode(nodes, function(node) {
if (helperIsVisible && !helperIsVisible(node))
return false;
var matchString = node.className + " " + node.id;
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
continue;
return false;
}
if (node.textContent.trim().length < 100) {
continue;
if (node.matches && node.matches("li p")) {
return false;
}
possibleParagraphs++;
if (possibleParagraphs >= 5) {
var textContentLength = node.textContent.trim().length;
if (textContentLength < 140) {
return false;
}
score += Math.sqrt(textContentLength - 140);
if (score > 20) {
return true;
}
}
return false;
return false;
});
},
/**