Bug 782285 - Don't offer reader in pages with too much reading competition (r=mfinkle)

* * *
blo
This commit is contained in:
Lucas Rocha 2012-08-23 17:08:13 +01:00
Родитель 07f3bf083b
Коммит 4a98b7cf20
1 изменённых файлов: 35 добавлений и 10 удалений

Просмотреть файл

@ -58,6 +58,10 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2, FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4, FLAG_CLEAN_CONDITIONALLY: 0x4,
// The number of top candidates to consider when analysing how
// tight the competition is among candidates.
N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call // The maximum number of pages to loop through before we call
// it quits and just show a link. // it quits and just show a link.
MAX_PAGES: 5, MAX_PAGES: 5,
@ -535,23 +539,33 @@ Readability.prototype = {
// After we've calculated scores, loop through all of the possible // After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score. // candidate nodes we found and find the one with the highest score.
let topCandidate = null; let topCandidates = [];
for (let c = 0, cl = candidates.length; c < cl; c += 1) { for (let c = 0, cl = candidates.length; c < cl; c += 1) {
let candidate = candidates[c];
// Scale the final candidates score based on link density. Good content // Scale the final candidates score based on link density. Good content
// should have a relatively small link density (5% or less) and be mostly // should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation. // unaffected by this operation.
candidates[c].readability.contentScore = let candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
candidates[c].readability.contentScore * (1 - this._getLinkDensity(candidates[c])); candidate.readability.contentScore = candidateScore;
this.log('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + this.log('Candidate: ' + candidate + " (" + candidate.className + ":" +
candidates[c].id + ") with score " + candidate.id + ") with score " + candidateScore);
candidates[c].readability.contentScore);
if (!topCandidate || for (let t = 0; t < this.N_TOP_CANDIDATES; t++) {
candidates[c].readability.contentScore > topCandidate.readability.contentScore) { let aTopCandidate = topCandidates[t];
topCandidate = candidates[c];
if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
topCandidates.splice(t, 0, candidate);
if (topCandidates.length > this.N_TOP_CANDIDATES)
topCandidates.pop();
break;
} }
} }
}
let topCandidate = topCandidates[0] || null;
let lastTopCandidate = (topCandidates.length > 3 ? topCandidates[topCandidates.length - 1] : null);
// If we still have no top candidate, just use the body as a last resort. // If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify. // We also have to copy the body node so it is something we can modify.
@ -647,7 +661,7 @@ Readability.prototype = {
// grabArticle with different flags set. This gives us a higher likelihood of // grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of // finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content. // finding the -right- content.
if (this._getInnerText(articleContent, false).length < 250) { if (this._getInnerText(articleContent, true).length < 500) {
page.innerHTML = pageCacheHtml; page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
@ -660,6 +674,17 @@ Readability.prototype = {
return null; return null;
} }
} else { } else {
if (lastTopCandidate !== null) {
// EXPERIMENTAL: Contrast ratio is how we measure the level of competition between candidates in the
// readability algorithm. This is to avoid offering reader mode on pages that are more like
// a list or directory of links with summaries. It takes the score of the last top candidate
// (see N_TOP_CANDIDATES) and checks how it compares to the top candidate's. On pages that are not
// actual articles, there will likely be many candidates with similar score (i.e. higher contrast ratio).
let contrastRatio = lastTopCandidate.readability.contentScore / topCandidate.readability.contentScore;
if (contrastRatio > 0.45)
return null;
}
return articleContent; return articleContent;
} }
} }