No bug - update readability from github repo, includes fix for Bug 1177619, r=Gijs

MozReview-Commit-ID: 5QhYAeW7aOb

--HG--
extra : rebase_source : b61753cb1abfadf8947809abc4b92b148c77d4fc
This commit is contained in:
Evan Tseng 2016-11-01 18:54:06 +08:00
Родитель 321e872bc2
Коммит 8583663005
3 изменённых файлов: 329 добавлений и 111 удалений

Просмотреть файл

@ -0,0 +1,199 @@
"use strict";
module.exports = {
"rules": {
// Braces only needed for multi-line arrow function blocks
// "arrow-body-style": [2, "as-needed"],
// Require spacing around =>
// "arrow-spacing": 2,
// Always require spacing around a single line block
// "block-spacing": 1,
// No newline before open brace for a block
"brace-style": 2,
// No space before always a space after a comma
"comma-spacing": [2, {"before": false, "after": true}],
// Commas at the end of the line not the start
// "comma-style": 2,
// Don't require spaces around computed properties
// "computed-property-spacing": [2, "never"],
// Functions must always return something or nothing
"consistent-return": 2,
// Require braces around blocks that start a new line
// Note that this rule is likely to be overridden on a per-directory basis
// very frequently.
// "curly": [2, "multi-line"],
// Always require a trailing EOL
"eol-last": 2,
// Require function* name()
// "generator-star-spacing": [2, {"before": false, "after": true}],
// Two space indent
"indent": [2, 2, { "SwitchCase": 1 }],
// Space after colon not before in property declarations
"key-spacing": [2, { "beforeColon": false, "afterColon": true, "mode": "minimum" }],
// Unix linebreaks
"linebreak-style": [2, "unix"],
// Always require parenthesis for new calls
"new-parens": 2,
// Use [] instead of Array()
// "no-array-constructor": 2,
// No duplicate arguments in function declarations
"no-dupe-args": 2,
// No duplicate keys in object declarations
"no-dupe-keys": 2,
// No duplicate cases in switch statements
"no-duplicate-case": 2,
// No labels
"no-labels": 2,
// If an if block ends with a return no need for an else block
"no-else-return": 2,
// No empty statements
"no-empty": 2,
// No empty character classes in regex
"no-empty-character-class": 2,
// Disallow empty destructuring
"no-empty-pattern": 2,
// No assiging to exception variable
// "no-ex-assign": 2,
// No using !! where casting to boolean is already happening
// "no-extra-boolean-cast": 2,
// No double semicolon
"no-extra-semi": 2,
// No overwriting defined functions
"no-func-assign": 2,
// Declarations in Program or Function Body
"no-inner-declarations": 2,
// No invalid regular expresions
"no-invalid-regexp": 2,
// No odd whitespace characters
"no-irregular-whitespace": 2,
// No single if block inside an else block
"no-lonely-if": 2,
// No mixing spaces and tabs in indent
"no-mixed-spaces-and-tabs": [2, "smart-tabs"],
// No unnecessary spacing
"no-multi-spaces": [2, { exceptions: { "AssignmentExpression": true, "VariableDeclarator": true, "ArrayExpression": true, "ObjectExpression": true } }],
// No reassigning native JS objects
"no-native-reassign": 2,
// No (!foo in bar)
"no-negated-in-lhs": 2,
// Nested ternary statements are confusing
"no-nested-ternary": 2,
// Use {} instead of new Object()
// "no-new-object": 2,
// No Math() or JSON()
"no-obj-calls": 2,
// No octal literals
"no-octal": 2,
// No redeclaring variables
"no-redeclare": 2,
// No unnecessary comparisons
"no-self-compare": 2,
// No declaring variables from an outer scope
"no-shadow": 2,
// No declaring variables that hide things like arguments
"no-shadow-restricted-names": 2,
// No spaces between function name and parentheses
"no-spaced-func": 2,
// No trailing whitespace
"no-trailing-spaces": 2,
// No using undeclared variables
// "no-undef": 2,
// Error on newline where a semicolon is needed
"no-unexpected-multiline": 2,
// No unreachable statements
"no-unreachable": 2,
// No expressions where a statement is expected
// "no-unused-expressions": 2,
// No declaring variables that are never used
"no-unused-vars": [2, {"vars": "all", "args": "none"}],
// No using variables before defined
// "no-use-before-define": [2, "nofunc"],
// No using with
"no-with": 2,
// Always require semicolon at end of statement
"semi": [2, "always"],
// Require space after keywords
"keyword-spacing": 2,
// Require space before blocks
"space-before-blocks": 2,
// Never use spaces before function parentheses
// "space-before-function-paren": [2, { "anonymous": "always", "named": "never" }],
// Require spaces before finally, catch, etc.
// "space-before-keywords": [2, "always"],
// No space padding in parentheses
// "space-in-parens": [2, "never"],
// Require spaces around operators
// "space-infix-ops": 2,
// Require spaces after return, throw and case
// "space-return-throw-case": 2,
// ++ and -- should not need spacing
// "space-unary-ops": [2, { "words": true, "nonwords": false }],
// No comparisons to NaN
"use-isnan": 2,
// Only check typeof against valid results
"valid-typeof": 2,
},
}

Просмотреть файл

@ -460,16 +460,15 @@
else
this.children.push(newNode);
}
} else {
} else if (oldNode.nodeType === Node.ELEMENT_NODE) {
// new node is not an element node.
// if the old one was, update its element siblings:
if (oldNode.nodeType === Node.ELEMENT_NODE) {
if (oldNode.previousElementSibling)
oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
if (oldNode.nextElementSibling)
oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
this.children.splice(this.children.indexOf(oldNode), 1);
}
if (oldNode.previousElementSibling)
oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
if (oldNode.nextElementSibling)
oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
this.children.splice(this.children.indexOf(oldNode), 1);
// If the old node wasn't an element, neither the new nor the old node was an element,
// and the children array and its members shouldn't need any updating.
}
@ -489,8 +488,8 @@
__JSDOMParser__: true,
};
for (var i in nodeTypes) {
Node[i] = Node.prototype[i] = nodeTypes[i];
for (var nodeType in nodeTypes) {
Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType];
}
var Attribute = function (name, value) {
@ -559,7 +558,7 @@
this._textContent = newText;
delete this._innerHTML;
},
}
};
var Document = function () {
this.styleSheets = [];
@ -829,7 +828,7 @@
Style.prototype.__defineSetter__(jsName, function (value) {
this.setStyle(cssName, value);
});
}) (styleMap[jsName]);
})(styleMap[jsName]);
}
var JSDOMParser = function () {
@ -976,7 +975,7 @@
retPair[0] = node;
retPair[1] = closed;
return true
return true;
},
/**
@ -1193,4 +1192,4 @@
// Attach JSDOMParser to the global scope
global.JSDOMParser = JSDOMParser;
}) (this);
})(this);

Просмотреть файл

@ -26,7 +26,6 @@
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*/
var root = this;
/**
* Public constructor.
@ -34,7 +33,7 @@ var root = this;
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
var Readability = function(uri, doc, options) {
function Readability(uri, doc, options) {
options = options || {};
this._uri = uri;
@ -83,12 +82,12 @@ var Readability = function(uri, doc, options) {
return rv + elDesc;
};
this.log = function () {
if ("dump" in root) {
if (typeof dump !== undefined) {
var msg = Array.prototype.map.call(arguments, function(x) {
return (x && x.nodeName) ? logEl(x) : x;
}).join(" ");
dump("Reader: (Readability) " + msg + "\n");
} else if ("console" in root) {
} else if (typeof console !== undefined) {
var args = ["Reader: (Readability) "].concat(arguments);
console.log.apply(console, args);
}
@ -122,10 +121,10 @@ Readability.prototype = {
REGEXPS: {
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
@ -150,6 +149,28 @@ Readability.prototype = {
this._fixRelativeUris(articleContent);
},
/**
* Iterates over a NodeList, calls `filterFn` for each node and removes node
* if function returned `true`.
*
* If function is not passed, removes all the nodes in node list.
*
* @param NodeList nodeList The nodes to operate on
* @param Function filterFn
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i];
var parentNode = node.parentNode;
if (parentNode) {
if (!filterFn || filterFn.call(this, node, i, nodeList)) {
parentNode.removeChild(node);
}
}
}
},
/**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
@ -159,10 +180,11 @@ Readability.prototype = {
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @param Boolean backward Whether to use backward iteration.
* @return void
*/
_forEachNode: function(nodeList, fn) {
return Array.prototype.forEach.call(nodeList, fn, this);
_forEachNode: function(nodeList, fn, backward) {
Array.prototype.forEach.call(nodeList, fn, this);
},
/**
@ -283,13 +305,13 @@ Readability.prototype = {
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
} catch(e) {}
} catch (e) {/* ignore exceptions setting the title. */}
if (curTitle.match(/ [\|\-] /)) {
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
if (curTitle.split(' ').length < 3)
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
@ -334,9 +356,7 @@ Readability.prototype = {
var doc = this._doc;
// Remove all style tags in head
this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
styleNode.parentNode.removeChild(styleNode);
});
this._removeNodes(doc.getElementsByTagName("style"));
if (doc.body) {
this._replaceBrs(doc.body);
@ -370,7 +390,7 @@ Readability.prototype = {
* <div>foo<br>bar<p>abc</p></div>
*/
_replaceBrs: function (elem) {
this._forEachNode(elem.getElementsByTagName("br"), function(br) {
this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a
@ -466,7 +486,7 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs
this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
var imgCount = paragraph.getElementsByTagName('img').length;
var embedCount = paragraph.getElementsByTagName('embed').length;
var objectCount = paragraph.getElementsByTagName('object').length;
@ -474,11 +494,10 @@ Readability.prototype = {
var iframeCount = paragraph.getElementsByTagName('iframe').length;
var totalCount = imgCount + embedCount + objectCount + iframeCount;
if (totalCount === 0 && !this._getInnerText(paragraph, false))
paragraph.parentNode.removeChild(paragraph);
return totalCount === 0 && !this._getInnerText(paragraph, false);
});
this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
var next = this._nextElement(br.nextSibling);
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
@ -495,7 +514,7 @@ Readability.prototype = {
_initializeNode: function(node) {
node.readability = {"contentScore": 0};
switch(node.tagName) {
switch (node.tagName) {
case 'DIV':
node.readability.contentScore += 5;
break;
@ -614,7 +633,7 @@ Readability.prototype = {
maxDepth = maxDepth || 0;
var i = 0, ancestors = [];
while (node.parentNode) {
ancestors.push(node.parentNode)
ancestors.push(node.parentNode);
if (maxDepth && ++i === maxDepth)
break;
node = node.parentNode;
@ -1042,17 +1061,12 @@ Readability.prototype = {
* @param Element
**/
_removeScripts: function(doc) {
this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
scriptNode.nodeValue = "";
scriptNode.removeAttribute('src');
if (scriptNode.parentNode)
scriptNode.parentNode.removeChild(scriptNode);
});
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
if (noscriptNode.parentNode)
noscriptNode.parentNode.removeChild(noscriptNode);
return true;
});
this._removeNodes(doc.getElementsByTagName('noscript'));
},
/**
@ -1101,9 +1115,8 @@ Readability.prototype = {
if (normalizeSpaces) {
return textContent.replace(this.REGEXPS.normalize, " ");
} else {
return textContent;
}
return textContent;
},
/**
@ -1113,7 +1126,7 @@ Readability.prototype = {
* @param string - what to split on. Default is ","
* @return number (integer)
**/
_getCharCount: function(e,s) {
_getCharCount: function(e, s) {
s = s || ",";
return this._getInnerText(e).split(s).length - 1;
},
@ -1382,15 +1395,14 @@ Readability.prototype = {
}
}
var nextHref = null;
if (topPage) {
var nextHref = topPage.href.replace(/\/$/,'');
nextHref = topPage.href.replace(/\/$/, '');
this.log('NEXT PAGE IS ' + nextHref);
this._parsedPages[nextHref] = true;
return nextHref;
} else {
return null;
}
return nextHref;
},
_successfulRequest: function(request) {
@ -1407,9 +1419,8 @@ Readability.prototype = {
if (this._successfulRequest(request)) {
if (options.success)
options.success(request);
} else {
if (options.error)
options.error(request);
} else if (options.error) {
options.error(request);
}
}
}
@ -1462,9 +1473,8 @@ Readability.prototype = {
this.log("Exact duplicate page found via ETag. Aborting.");
articlePage.style.display = 'none';
return;
} else {
this._pageETags[eTag] = 1;
}
this._pageETags[eTag] = 1;
}
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
@ -1478,9 +1488,9 @@ Readability.prototype = {
// - Turn all double br's into p's - was handled by prepDocument in the original view.
// Maybe in the future abstract out prepDocument to work for both the original document
// and AJAX-added pages.
var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
page.innerHTML = responseHtml;
@ -1490,7 +1500,7 @@ Readability.prototype = {
// disable as necessary at the end of grabArticle.
this._flags = 0x1 | 0x2 | 0x4;
var nextPageLink = this._findNextPageLink(page);
var secondNextPageLink = this._findNextPageLink(page);
// NOTE: if we end up supporting _appendNextPage(), we'll need to
// change this call to be async
@ -1529,8 +1539,8 @@ Readability.prototype = {
}).bind(this), 500);
if (nextPageLink)
this._appendNextPage(nextPageLink);
if (secondNextPageLink)
this._appendNextPage(secondNextPageLink);
}
});
}).bind(this)(nextPageLink, articlePage);
@ -1581,7 +1591,7 @@ Readability.prototype = {
_clean: function(e, tag) {
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
this._forEachNode(e.getElementsByTagName(tag), function(element) {
this._removeNodes(e.getElementsByTagName(tag), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) {
var attributeValues = [].map.call(element.attributes, function(attr) {
@ -1590,14 +1600,14 @@ Readability.prototype = {
// First, check the elements attributes to see if any of them contain youtube or vimeo
if (this.REGEXPS.videos.test(attributeValues))
return;
return false;
// Then check the elements inside this element for the same.
if (this.REGEXPS.videos.test(element.innerHTML))
return;
return false;
}
element.parentNode.removeChild(element);
return true;
});
},
@ -1634,8 +1644,6 @@ Readability.prototype = {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return;
var tagsList = e.getElementsByTagName(tag);
var curTagsLength = tagsList.length;
var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within.
@ -1643,54 +1651,48 @@ Readability.prototype = {
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
for (var i = curTagsLength-1; i >= 0; i -= 1) {
var weight = this._getClassWeight(tagsList[i]);
this._removeNodes(e.getElementsByTagName(tag), function(node) {
var weight = this._getClassWeight(node);
var contentScore = 0;
this.log("Cleaning Conditionally", tagsList[i]);
this.log("Cleaning Conditionally", node);
if (weight + contentScore < 0) {
tagsList[i].parentNode.removeChild(tagsList[i]);
} else if (this._getCharCount(tagsList[i],',') < 10) {
return true;
}
if (this._getCharCount(node, ',') < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
var p = tagsList[i].getElementsByTagName("p").length;
var img = tagsList[i].getElementsByTagName("img").length;
var li = tagsList[i].getElementsByTagName("li").length-100;
var input = tagsList[i].getElementsByTagName("input").length;
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
var li = node.getElementsByTagName("li").length-100;
var input = node.getElementsByTagName("input").length;
var embedCount = 0;
var embeds = tagsList[i].getElementsByTagName("embed");
var embeds = node.getElementsByTagName("embed");
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}
var linkDensity = this._getLinkDensity(tagsList[i]);
var contentLength = this._getInnerText(tagsList[i]).length;
var toRemove = false;
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
toRemove = true;
} else if (!isList && li > p) {
toRemove = true;
} else if (input > Math.floor(p/3)) {
toRemove = true;
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
toRemove = true;
} else if (!isList && weight < 25 && linkDensity > 0.2) {
toRemove = true;
} else if (weight >= 25 && linkDensity > 0.5) {
toRemove = true;
} else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
toRemove = true;
}
var linkDensity = this._getLinkDensity(node);
var contentLength = this._getInnerText(node).length;
if (toRemove) {
tagsList[i].parentNode.removeChild(tagsList[i]);
}
var haveToRemove =
// Make an exception for elements with no p's and exactly 1 img.
(img > p && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2)) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
}
}
return false;
});
},
/**
@ -1701,11 +1703,9 @@ Readability.prototype = {
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
var headers = e.getElementsByTagName('h' + headerIndex);
for (var i = headers.length - 1; i >= 0; i -= 1) {
if (this._getClassWeight(headers[i]) < 0)
headers[i].parentNode.removeChild(headers[i]);
}
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
return this._getClassWeight(header) < 0;
});
}
},
@ -1729,6 +1729,22 @@ Readability.prototype = {
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
// Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
// Some articles' DOM structures might look like
// <div>
// Sentences<br>
// <br>
// Sentences<br>
// </div>
var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
if (brNodes.length) {
var set = new Set();
[].forEach.call(brNodes, function(node) {
set.add(node.parentNode);
});
nodes = [].concat.apply(Array.from(set), nodes);
}
// FIXME we should have a fallback for helperIsVisible, but this is
// problematic because of jsdom's elem.style handling - see
// https://github.com/mozilla/readability/pull/186 for context.
@ -1832,12 +1848,16 @@ Readability.prototype = {
}
}
return { uri: this._uri,
title: articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
length: articleContent.textContent.length,
excerpt: metadata.excerpt };
var textContent = articleContent.textContent;
return {
uri: this._uri,
title: articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
};
}
};