зеркало из https://github.com/mozilla/gecko-dev.git
No bug - update readability from github repo, includes fix for Bug 1173548 and Bug 1255978, r=Gijs
MozReview-Commit-ID: 1Hr9uUqZn5P --HG-- extra : rebase_source : b248ab70df487b2cff9db390e047b3c9016784af
This commit is contained in:
Родитель
d0dc500d81
Коммит
286b960470
|
@ -119,7 +119,7 @@ Readability.prototype = {
|
|||
// All of the regular expressions in use within readability.
|
||||
// Defined up here so we don't instantiate them repeatedly in loops.
|
||||
REGEXPS: {
|
||||
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|legends|menu|modal|related|remark|rss|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
|
||||
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
|
||||
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
|
||||
|
@ -471,6 +471,12 @@ Readability.prototype = {
|
|||
this._clean(articleContent, "h1");
|
||||
this._clean(articleContent, "footer");
|
||||
|
||||
// Clean out elements that have "share" in their id/class combinations from the final top candidates,
|
||||
// which means we don't remove the top candidates themselves even if they have "share".
|
||||
this._forEachNode(articleContent.children, function(topCandidate) {
|
||||
this._cleanMatchedNodes(topCandidate, /share/);
|
||||
});
|
||||
|
||||
// If there is only one h2, they are probably using it as a header
|
||||
// and not a subheader, so remove it since we already have a header.
|
||||
if (articleContent.getElementsByTagName('h2').length === 1)
|
||||
|
@ -662,9 +668,6 @@ Readability.prototype = {
|
|||
|
||||
var pageCacheHtml = page.innerHTML;
|
||||
|
||||
// Check if any "dir" is set on the toplevel document element
|
||||
this._articleDir = doc.documentElement.getAttribute("dir");
|
||||
|
||||
while (true) {
|
||||
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
||||
|
||||
|
@ -812,6 +815,7 @@ Readability.prototype = {
|
|||
|
||||
var topCandidate = topCandidates[0] || null;
|
||||
var neededToCreateTopCandidate = false;
|
||||
var parentOfTopCandidate;
|
||||
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
// We also have to copy the body node so it is something we can modify.
|
||||
|
@ -838,7 +842,7 @@ Readability.prototype = {
|
|||
// lurking in other places that we want to unify in. The sibling stuff
|
||||
// below does some of that - but only if we've looked high enough up the DOM
|
||||
// tree.
|
||||
var parentOfTopCandidate = topCandidate.parentNode;
|
||||
parentOfTopCandidate = topCandidate.parentNode;
|
||||
var lastScore = topCandidate.readability.contentScore;
|
||||
// The scores shouldn't get too low.
|
||||
var scoreThreshold = lastScore / 3;
|
||||
|
@ -864,7 +868,9 @@ Readability.prototype = {
|
|||
articleContent.id = "readability-content";
|
||||
|
||||
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
|
||||
var siblings = topCandidate.parentNode.children;
|
||||
// Keep potential top candidate's parent node to try to get text direction of it later.
|
||||
parentOfTopCandidate = topCandidate.parentNode;
|
||||
var siblings = parentOfTopCandidate.children;
|
||||
|
||||
for (var s = 0, sl = siblings.length; s < sl; s++) {
|
||||
var sibling = siblings[s];
|
||||
|
@ -968,6 +974,18 @@ Readability.prototype = {
|
|||
return null;
|
||||
}
|
||||
} else {
|
||||
// Find out text direction from ancestors of final top candidate.
|
||||
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
|
||||
this._someNode(ancestors, function(ancestor) {
|
||||
if (!ancestor.tagName)
|
||||
return false;
|
||||
var articleDir = ancestor.getAttribute("dir");
|
||||
if (articleDir) {
|
||||
this._articleDir = articleDir;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
return articleContent;
|
||||
}
|
||||
}
|
||||
|
@ -1695,6 +1713,25 @@ Readability.prototype = {
|
|||
});
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean out elements whose id/class combinations match specific string.
|
||||
*
|
||||
* @param Element
|
||||
* @param RegExp match id/class combination.
|
||||
* @return void
|
||||
**/
|
||||
_cleanMatchedNodes: function(e, regex) {
|
||||
var endOfSearchMarkerNode = this._getNextNode(e, true);
|
||||
var next = this._getNextNode(e);
|
||||
while (next && next != endOfSearchMarkerNode) {
|
||||
if (regex.test(next.className + " " + next.id)) {
|
||||
next = this._removeAndGetNext(next);
|
||||
} else {
|
||||
next = this._getNextNode(next);
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean out spurious headers from an Element. Checks things like classnames and link density.
|
||||
*
|
||||
|
|
Загрузка…
Ссылка в новой задаче