No bug - update readability from github repo, includes fix for Bug 1173548 and Bug 1255978, r=Gijs

MozReview-Commit-ID: 1Hr9uUqZn5P

--HG--
extra : rebase_source : b248ab70df487b2cff9db390e047b3c9016784af
This commit is contained in:
Evan Tseng 2016-11-28 17:10:20 +08:00
Родитель d0dc500d81
Коммит 286b960470
1 изменённых файлов: 43 добавлений и 6 удалений

Просмотреть файл

@ -119,7 +119,7 @@ Readability.prototype = {
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|legends|menu|modal|related|remark|rss|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@ -471,6 +471,12 @@ Readability.prototype = {
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
this._forEachNode(articleContent.children, function(topCandidate) {
this._cleanMatchedNodes(topCandidate, /share/);
});
// If there is only one h2, they are probably using it as a header
// and not a subheader, so remove it since we already have a header.
if (articleContent.getElementsByTagName('h2').length === 1)
@ -662,9 +668,6 @@ Readability.prototype = {
var pageCacheHtml = page.innerHTML;
// Check if any "dir" is set on the toplevel document element
this._articleDir = doc.documentElement.getAttribute("dir");
while (true) {
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
@ -812,6 +815,7 @@ Readability.prototype = {
var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
var parentOfTopCandidate;
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
@ -838,7 +842,7 @@ Readability.prototype = {
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
var parentOfTopCandidate = topCandidate.parentNode;
parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
@ -864,7 +868,9 @@ Readability.prototype = {
articleContent.id = "readability-content";
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblings = topCandidate.parentNode.children;
// Keep potential top candidate's parent node to try to get text direction of it later.
parentOfTopCandidate = topCandidate.parentNode;
var siblings = parentOfTopCandidate.children;
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
@ -968,6 +974,18 @@ Readability.prototype = {
return null;
}
} else {
// Find out text direction from ancestors of final top candidate.
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
this._someNode(ancestors, function(ancestor) {
if (!ancestor.tagName)
return false;
var articleDir = ancestor.getAttribute("dir");
if (articleDir) {
this._articleDir = articleDir;
return true;
}
return false;
});
return articleContent;
}
}
@ -1695,6 +1713,25 @@ Readability.prototype = {
});
},
/**
* Clean out elements whose id/class combinations match specific string.
*
* @param Element
* @param RegExp match id/class combination.
* @return void
**/
_cleanMatchedNodes: function(e, regex) {
var endOfSearchMarkerNode = this._getNextNode(e, true);
var next = this._getNextNode(e);
while (next && next != endOfSearchMarkerNode) {
if (regex.test(next.className + " " + next.id)) {
next = this._removeAndGetNext(next);
} else {
next = this._getNextNode(next);
}
}
},
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*