зеркало из https://github.com/mozilla/gecko-dev.git
No bug - update readability from github repo, includes fix for Bug 1177619, r=Gijs
MozReview-Commit-ID: 5QhYAeW7aOb --HG-- extra : rebase_source : b61753cb1abfadf8947809abc4b92b148c77d4fc
This commit is contained in:
Родитель
321e872bc2
Коммит
8583663005
|
@ -0,0 +1,199 @@
|
|||
"use strict";
|
||||
|
||||
module.exports = {
|
||||
"rules": {
|
||||
// Braces only needed for multi-line arrow function blocks
|
||||
// "arrow-body-style": [2, "as-needed"],
|
||||
|
||||
// Require spacing around =>
|
||||
// "arrow-spacing": 2,
|
||||
|
||||
// Always require spacing around a single line block
|
||||
// "block-spacing": 1,
|
||||
|
||||
// No newline before open brace for a block
|
||||
"brace-style": 2,
|
||||
|
||||
// No space before, always a space after a comma
|
||||
"comma-spacing": [2, {"before": false, "after": true}],
|
||||
|
||||
// Commas at the end of the line not the start
|
||||
// "comma-style": 2,
|
||||
|
||||
// Don't require spaces around computed properties
|
||||
// "computed-property-spacing": [2, "never"],
|
||||
|
||||
// Functions must always return something or nothing
|
||||
"consistent-return": 2,
|
||||
|
||||
// Require braces around blocks that start a new line
|
||||
// Note that this rule is likely to be overridden on a per-directory basis
|
||||
// very frequently.
|
||||
// "curly": [2, "multi-line"],
|
||||
|
||||
// Always require a trailing EOL
|
||||
"eol-last": 2,
|
||||
|
||||
// Require function* name()
|
||||
// "generator-star-spacing": [2, {"before": false, "after": true}],
|
||||
|
||||
// Two space indent
|
||||
"indent": [2, 2, { "SwitchCase": 1 }],
|
||||
|
||||
// Space after colon not before in property declarations
|
||||
"key-spacing": [2, { "beforeColon": false, "afterColon": true, "mode": "minimum" }],
|
||||
|
||||
// Unix linebreaks
|
||||
"linebreak-style": [2, "unix"],
|
||||
|
||||
// Always require parentheses for new calls
|
||||
"new-parens": 2,
|
||||
|
||||
// Use [] instead of Array()
|
||||
// "no-array-constructor": 2,
|
||||
|
||||
// No duplicate arguments in function declarations
|
||||
"no-dupe-args": 2,
|
||||
|
||||
// No duplicate keys in object declarations
|
||||
"no-dupe-keys": 2,
|
||||
|
||||
// No duplicate cases in switch statements
|
||||
"no-duplicate-case": 2,
|
||||
|
||||
// No labels
|
||||
"no-labels": 2,
|
||||
|
||||
// If an if block ends with a return no need for an else block
|
||||
"no-else-return": 2,
|
||||
|
||||
// No empty statements
|
||||
"no-empty": 2,
|
||||
|
||||
// No empty character classes in regex
|
||||
"no-empty-character-class": 2,
|
||||
|
||||
// Disallow empty destructuring
|
||||
"no-empty-pattern": 2,
|
||||
|
||||
// No assigning to exception variable
|
||||
// "no-ex-assign": 2,
|
||||
|
||||
// No using !! where casting to boolean is already happening
|
||||
// "no-extra-boolean-cast": 2,
|
||||
|
||||
// No double semicolon
|
||||
"no-extra-semi": 2,
|
||||
|
||||
// No overwriting defined functions
|
||||
"no-func-assign": 2,
|
||||
|
||||
// Declarations in Program or Function Body
|
||||
"no-inner-declarations": 2,
|
||||
|
||||
// No invalid regular expressions
|
||||
"no-invalid-regexp": 2,
|
||||
|
||||
// No odd whitespace characters
|
||||
"no-irregular-whitespace": 2,
|
||||
|
||||
// No single if block inside an else block
|
||||
"no-lonely-if": 2,
|
||||
|
||||
// No mixing spaces and tabs in indent
|
||||
"no-mixed-spaces-and-tabs": [2, "smart-tabs"],
|
||||
|
||||
// No unnecessary spacing
|
||||
"no-multi-spaces": [2, { exceptions: { "AssignmentExpression": true, "VariableDeclarator": true, "ArrayExpression": true, "ObjectExpression": true } }],
|
||||
|
||||
// No reassigning native JS objects
|
||||
"no-native-reassign": 2,
|
||||
|
||||
// No (!foo in bar)
|
||||
"no-negated-in-lhs": 2,
|
||||
|
||||
// Nested ternary statements are confusing
|
||||
"no-nested-ternary": 2,
|
||||
|
||||
// Use {} instead of new Object()
|
||||
// "no-new-object": 2,
|
||||
|
||||
// No Math() or JSON()
|
||||
"no-obj-calls": 2,
|
||||
|
||||
// No octal literals
|
||||
"no-octal": 2,
|
||||
|
||||
// No redeclaring variables
|
||||
"no-redeclare": 2,
|
||||
|
||||
// No unnecessary comparisons
|
||||
"no-self-compare": 2,
|
||||
|
||||
// No declaring variables from an outer scope
|
||||
"no-shadow": 2,
|
||||
|
||||
// No declaring variables that hide things like arguments
|
||||
"no-shadow-restricted-names": 2,
|
||||
|
||||
// No spaces between function name and parentheses
|
||||
"no-spaced-func": 2,
|
||||
|
||||
// No trailing whitespace
|
||||
"no-trailing-spaces": 2,
|
||||
|
||||
// No using undeclared variables
|
||||
// "no-undef": 2,
|
||||
|
||||
// Error on newline where a semicolon is needed
|
||||
"no-unexpected-multiline": 2,
|
||||
|
||||
// No unreachable statements
|
||||
"no-unreachable": 2,
|
||||
|
||||
// No expressions where a statement is expected
|
||||
// "no-unused-expressions": 2,
|
||||
|
||||
// No declaring variables that are never used
|
||||
"no-unused-vars": [2, {"vars": "all", "args": "none"}],
|
||||
|
||||
// No using variables before defined
|
||||
// "no-use-before-define": [2, "nofunc"],
|
||||
|
||||
// No using with
|
||||
"no-with": 2,
|
||||
|
||||
// Always require semicolon at end of statement
|
||||
"semi": [2, "always"],
|
||||
|
||||
// Require space after keywords
|
||||
"keyword-spacing": 2,
|
||||
|
||||
// Require space before blocks
|
||||
"space-before-blocks": 2,
|
||||
|
||||
// Never use spaces before function parentheses
|
||||
// "space-before-function-paren": [2, { "anonymous": "always", "named": "never" }],
|
||||
|
||||
// Require spaces before finally, catch, etc.
|
||||
// "space-before-keywords": [2, "always"],
|
||||
|
||||
// No space padding in parentheses
|
||||
// "space-in-parens": [2, "never"],
|
||||
|
||||
// Require spaces around operators
|
||||
// "space-infix-ops": 2,
|
||||
|
||||
// Require spaces after return, throw and case
|
||||
// "space-return-throw-case": 2,
|
||||
|
||||
// ++ and -- should not need spacing
|
||||
// "space-unary-ops": [2, { "words": true, "nonwords": false }],
|
||||
|
||||
// No comparisons to NaN
|
||||
"use-isnan": 2,
|
||||
|
||||
// Only check typeof against valid results
|
||||
"valid-typeof": 2,
|
||||
},
|
||||
}
|
|
@ -460,16 +460,15 @@
|
|||
else
|
||||
this.children.push(newNode);
|
||||
}
|
||||
} else {
|
||||
} else if (oldNode.nodeType === Node.ELEMENT_NODE) {
|
||||
// new node is not an element node.
|
||||
// if the old one was, update its element siblings:
|
||||
if (oldNode.nodeType === Node.ELEMENT_NODE) {
|
||||
if (oldNode.previousElementSibling)
|
||||
oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
|
||||
if (oldNode.nextElementSibling)
|
||||
oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
|
||||
this.children.splice(this.children.indexOf(oldNode), 1);
|
||||
}
|
||||
if (oldNode.previousElementSibling)
|
||||
oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
|
||||
if (oldNode.nextElementSibling)
|
||||
oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
|
||||
this.children.splice(this.children.indexOf(oldNode), 1);
|
||||
|
||||
// If the old node wasn't an element, neither the new nor the old node was an element,
|
||||
// and the children array and its members shouldn't need any updating.
|
||||
}
|
||||
|
@ -489,8 +488,8 @@
|
|||
__JSDOMParser__: true,
|
||||
};
|
||||
|
||||
for (var i in nodeTypes) {
|
||||
Node[i] = Node.prototype[i] = nodeTypes[i];
|
||||
for (var nodeType in nodeTypes) {
|
||||
Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType];
|
||||
}
|
||||
|
||||
var Attribute = function (name, value) {
|
||||
|
@ -559,7 +558,7 @@
|
|||
this._textContent = newText;
|
||||
delete this._innerHTML;
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
var Document = function () {
|
||||
this.styleSheets = [];
|
||||
|
@ -829,7 +828,7 @@
|
|||
Style.prototype.__defineSetter__(jsName, function (value) {
|
||||
this.setStyle(cssName, value);
|
||||
});
|
||||
}) (styleMap[jsName]);
|
||||
})(styleMap[jsName]);
|
||||
}
|
||||
|
||||
var JSDOMParser = function () {
|
||||
|
@ -976,7 +975,7 @@
|
|||
|
||||
retPair[0] = node;
|
||||
retPair[1] = closed;
|
||||
return true
|
||||
return true;
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -1193,4 +1192,4 @@
|
|||
// Attach JSDOMParser to the global scope
|
||||
global.JSDOMParser = JSDOMParser;
|
||||
|
||||
}) (this);
|
||||
})(this);
|
||||
|
|
|
@ -26,7 +26,6 @@
|
|||
* This code is heavily based on Arc90's readability.js (1.7.1) script
|
||||
* available at: http://code.google.com/p/arc90labs-readability
|
||||
*/
|
||||
var root = this;
|
||||
|
||||
/**
|
||||
* Public constructor.
|
||||
|
@ -34,7 +33,7 @@ var root = this;
|
|||
* @param {HTMLDocument} doc The document to parse.
|
||||
* @param {Object} options The options object.
|
||||
*/
|
||||
var Readability = function(uri, doc, options) {
|
||||
function Readability(uri, doc, options) {
|
||||
options = options || {};
|
||||
|
||||
this._uri = uri;
|
||||
|
@ -83,12 +82,12 @@ var Readability = function(uri, doc, options) {
|
|||
return rv + elDesc;
|
||||
};
|
||||
this.log = function () {
|
||||
if ("dump" in root) {
|
||||
if (typeof dump !== undefined) {
|
||||
var msg = Array.prototype.map.call(arguments, function(x) {
|
||||
return (x && x.nodeName) ? logEl(x) : x;
|
||||
}).join(" ");
|
||||
dump("Reader: (Readability) " + msg + "\n");
|
||||
} else if ("console" in root) {
|
||||
} else if (typeof console !== undefined) {
|
||||
var args = ["Reader: (Readability) "].concat(arguments);
|
||||
console.log.apply(console, args);
|
||||
}
|
||||
|
@ -122,10 +121,10 @@ Readability.prototype = {
|
|||
REGEXPS: {
|
||||
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
|
||||
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
||||
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
|
||||
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
|
||||
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
|
||||
byline: /byline|author|dateline|writtenby/i,
|
||||
byline: /byline|author|dateline|writtenby|p-author/i,
|
||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||
normalize: /\s{2,}/g,
|
||||
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||
|
@ -150,6 +149,28 @@ Readability.prototype = {
|
|||
this._fixRelativeUris(articleContent);
|
||||
},
|
||||
|
||||
/**
|
||||
* Iterates over a NodeList, calls `filterFn` for each node and removes node
|
||||
* if function returned `true`.
|
||||
*
|
||||
* If function is not passed, removes all the nodes in node list.
|
||||
*
|
||||
* @param NodeList nodeList The nodes to operate on
|
||||
* @param Function filterFn
|
||||
* @return void
|
||||
*/
|
||||
_removeNodes: function(nodeList, filterFn) {
|
||||
for (var i = nodeList.length - 1; i >= 0; i--) {
|
||||
var node = nodeList[i];
|
||||
var parentNode = node.parentNode;
|
||||
if (parentNode) {
|
||||
if (!filterFn || filterFn.call(this, node, i, nodeList)) {
|
||||
parentNode.removeChild(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* Iterate over a NodeList, which doesn't natively fully implement the Array
|
||||
* interface.
|
||||
|
@ -159,10 +180,11 @@ Readability.prototype = {
|
|||
*
|
||||
* @param NodeList nodeList The NodeList.
|
||||
* @param Function fn The iterate function.
|
||||
* @param Boolean backward Whether to use backward iteration (currently unused; iteration is always forward).
|
||||
* @return void
|
||||
*/
|
||||
_forEachNode: function(nodeList, fn) {
|
||||
return Array.prototype.forEach.call(nodeList, fn, this);
|
||||
_forEachNode: function(nodeList, fn, backward) {
|
||||
Array.prototype.forEach.call(nodeList, fn, this);
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -283,13 +305,13 @@ Readability.prototype = {
|
|||
// If they had an element with id "title" in their HTML
|
||||
if (typeof curTitle !== "string")
|
||||
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
|
||||
} catch(e) {}
|
||||
} catch (e) {/* ignore exceptions setting the title. */}
|
||||
|
||||
if (curTitle.match(/ [\|\-] /)) {
|
||||
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
|
||||
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
|
||||
|
||||
if (curTitle.split(' ').length < 3)
|
||||
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
|
||||
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
|
||||
} else if (curTitle.indexOf(': ') !== -1) {
|
||||
// Check if we have a heading containing this exact string, so we
|
||||
// could assume it's the full title.
|
||||
|
@ -334,9 +356,7 @@ Readability.prototype = {
|
|||
var doc = this._doc;
|
||||
|
||||
// Remove all style tags in head
|
||||
this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
|
||||
styleNode.parentNode.removeChild(styleNode);
|
||||
});
|
||||
this._removeNodes(doc.getElementsByTagName("style"));
|
||||
|
||||
if (doc.body) {
|
||||
this._replaceBrs(doc.body);
|
||||
|
@ -370,7 +390,7 @@ Readability.prototype = {
|
|||
* <div>foo<br>bar<p>abc</p></div>
|
||||
*/
|
||||
_replaceBrs: function (elem) {
|
||||
this._forEachNode(elem.getElementsByTagName("br"), function(br) {
|
||||
this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
|
||||
var next = br.nextSibling;
|
||||
|
||||
// Whether 2 or more <br> elements have been found and replaced with a
|
||||
|
@ -466,7 +486,7 @@ Readability.prototype = {
|
|||
this._cleanConditionally(articleContent, "div");
|
||||
|
||||
// Remove extra paragraphs
|
||||
this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
|
||||
this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
|
||||
var imgCount = paragraph.getElementsByTagName('img').length;
|
||||
var embedCount = paragraph.getElementsByTagName('embed').length;
|
||||
var objectCount = paragraph.getElementsByTagName('object').length;
|
||||
|
@ -474,11 +494,10 @@ Readability.prototype = {
|
|||
var iframeCount = paragraph.getElementsByTagName('iframe').length;
|
||||
var totalCount = imgCount + embedCount + objectCount + iframeCount;
|
||||
|
||||
if (totalCount === 0 && !this._getInnerText(paragraph, false))
|
||||
paragraph.parentNode.removeChild(paragraph);
|
||||
return totalCount === 0 && !this._getInnerText(paragraph, false);
|
||||
});
|
||||
|
||||
this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
|
||||
this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
|
||||
var next = this._nextElement(br.nextSibling);
|
||||
if (next && next.tagName == "P")
|
||||
br.parentNode.removeChild(br);
|
||||
|
@ -495,7 +514,7 @@ Readability.prototype = {
|
|||
_initializeNode: function(node) {
|
||||
node.readability = {"contentScore": 0};
|
||||
|
||||
switch(node.tagName) {
|
||||
switch (node.tagName) {
|
||||
case 'DIV':
|
||||
node.readability.contentScore += 5;
|
||||
break;
|
||||
|
@ -614,7 +633,7 @@ Readability.prototype = {
|
|||
maxDepth = maxDepth || 0;
|
||||
var i = 0, ancestors = [];
|
||||
while (node.parentNode) {
|
||||
ancestors.push(node.parentNode)
|
||||
ancestors.push(node.parentNode);
|
||||
if (maxDepth && ++i === maxDepth)
|
||||
break;
|
||||
node = node.parentNode;
|
||||
|
@ -1042,17 +1061,12 @@ Readability.prototype = {
|
|||
* @param Element
|
||||
**/
|
||||
_removeScripts: function(doc) {
|
||||
this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
|
||||
this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
|
||||
scriptNode.nodeValue = "";
|
||||
scriptNode.removeAttribute('src');
|
||||
|
||||
if (scriptNode.parentNode)
|
||||
scriptNode.parentNode.removeChild(scriptNode);
|
||||
});
|
||||
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
|
||||
if (noscriptNode.parentNode)
|
||||
noscriptNode.parentNode.removeChild(noscriptNode);
|
||||
return true;
|
||||
});
|
||||
this._removeNodes(doc.getElementsByTagName('noscript'));
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -1101,9 +1115,8 @@ Readability.prototype = {
|
|||
|
||||
if (normalizeSpaces) {
|
||||
return textContent.replace(this.REGEXPS.normalize, " ");
|
||||
} else {
|
||||
return textContent;
|
||||
}
|
||||
return textContent;
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -1113,7 +1126,7 @@ Readability.prototype = {
|
|||
* @param string - what to split on. Default is ","
|
||||
* @return number (integer)
|
||||
**/
|
||||
_getCharCount: function(e,s) {
|
||||
_getCharCount: function(e, s) {
|
||||
s = s || ",";
|
||||
return this._getInnerText(e).split(s).length - 1;
|
||||
},
|
||||
|
@ -1382,15 +1395,14 @@ Readability.prototype = {
|
|||
}
|
||||
}
|
||||
|
||||
var nextHref = null;
|
||||
if (topPage) {
|
||||
var nextHref = topPage.href.replace(/\/$/,'');
|
||||
nextHref = topPage.href.replace(/\/$/, '');
|
||||
|
||||
this.log('NEXT PAGE IS ' + nextHref);
|
||||
this._parsedPages[nextHref] = true;
|
||||
return nextHref;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
return nextHref;
|
||||
},
|
||||
|
||||
_successfulRequest: function(request) {
|
||||
|
@ -1407,9 +1419,8 @@ Readability.prototype = {
|
|||
if (this._successfulRequest(request)) {
|
||||
if (options.success)
|
||||
options.success(request);
|
||||
} else {
|
||||
if (options.error)
|
||||
options.error(request);
|
||||
} else if (options.error) {
|
||||
options.error(request);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1462,9 +1473,8 @@ Readability.prototype = {
|
|||
this.log("Exact duplicate page found via ETag. Aborting.");
|
||||
articlePage.style.display = 'none';
|
||||
return;
|
||||
} else {
|
||||
this._pageETags[eTag] = 1;
|
||||
}
|
||||
this._pageETags[eTag] = 1;
|
||||
}
|
||||
|
||||
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
|
||||
|
@ -1478,9 +1488,9 @@ Readability.prototype = {
|
|||
// - Turn all double br's into p's - was handled by prepDocument in the original view.
|
||||
// Maybe in the future abstract out prepDocument to work for both the original document
|
||||
// and AJAX-added pages.
|
||||
var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
||||
responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
||||
responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
|
||||
var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
||||
responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
||||
responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
|
||||
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
|
||||
|
||||
page.innerHTML = responseHtml;
|
||||
|
@ -1490,7 +1500,7 @@ Readability.prototype = {
|
|||
// disable as necessary at the end of grabArticle.
|
||||
this._flags = 0x1 | 0x2 | 0x4;
|
||||
|
||||
var nextPageLink = this._findNextPageLink(page);
|
||||
var secondNextPageLink = this._findNextPageLink(page);
|
||||
|
||||
// NOTE: if we end up supporting _appendNextPage(), we'll need to
|
||||
// change this call to be async
|
||||
|
@ -1529,8 +1539,8 @@ Readability.prototype = {
|
|||
}).bind(this), 500);
|
||||
|
||||
|
||||
if (nextPageLink)
|
||||
this._appendNextPage(nextPageLink);
|
||||
if (secondNextPageLink)
|
||||
this._appendNextPage(secondNextPageLink);
|
||||
}
|
||||
});
|
||||
}).bind(this)(nextPageLink, articlePage);
|
||||
|
@ -1581,7 +1591,7 @@ Readability.prototype = {
|
|||
_clean: function(e, tag) {
|
||||
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
|
||||
|
||||
this._forEachNode(e.getElementsByTagName(tag), function(element) {
|
||||
this._removeNodes(e.getElementsByTagName(tag), function(element) {
|
||||
// Allow youtube and vimeo videos through as people usually want to see those.
|
||||
if (isEmbed) {
|
||||
var attributeValues = [].map.call(element.attributes, function(attr) {
|
||||
|
@ -1590,14 +1600,14 @@ Readability.prototype = {
|
|||
|
||||
// First, check the element's attributes to see if any of them contain youtube or vimeo
|
||||
if (this.REGEXPS.videos.test(attributeValues))
|
||||
return;
|
||||
return false;
|
||||
|
||||
// Then check the elements inside this element for the same.
|
||||
if (this.REGEXPS.videos.test(element.innerHTML))
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
element.parentNode.removeChild(element);
|
||||
return true;
|
||||
});
|
||||
},
|
||||
|
||||
|
@ -1634,8 +1644,6 @@ Readability.prototype = {
|
|||
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
|
||||
return;
|
||||
|
||||
var tagsList = e.getElementsByTagName(tag);
|
||||
var curTagsLength = tagsList.length;
|
||||
var isList = tag === "ul" || tag === "ol";
|
||||
|
||||
// Gather counts for other typical elements embedded within.
|
||||
|
@ -1643,54 +1651,48 @@ Readability.prototype = {
|
|||
// without affecting the traversal.
|
||||
//
|
||||
// TODO: Consider taking into account original contentScore here.
|
||||
for (var i = curTagsLength-1; i >= 0; i -= 1) {
|
||||
var weight = this._getClassWeight(tagsList[i]);
|
||||
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
||||
var weight = this._getClassWeight(node);
|
||||
var contentScore = 0;
|
||||
|
||||
this.log("Cleaning Conditionally", tagsList[i]);
|
||||
this.log("Cleaning Conditionally", node);
|
||||
|
||||
if (weight + contentScore < 0) {
|
||||
tagsList[i].parentNode.removeChild(tagsList[i]);
|
||||
} else if (this._getCharCount(tagsList[i],',') < 10) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (this._getCharCount(node, ',') < 10) {
|
||||
// If there are not very many commas, and the number of
|
||||
// non-paragraph elements is more than paragraphs or other
|
||||
// ominous signs, remove the element.
|
||||
var p = tagsList[i].getElementsByTagName("p").length;
|
||||
var img = tagsList[i].getElementsByTagName("img").length;
|
||||
var li = tagsList[i].getElementsByTagName("li").length-100;
|
||||
var input = tagsList[i].getElementsByTagName("input").length;
|
||||
var p = node.getElementsByTagName("p").length;
|
||||
var img = node.getElementsByTagName("img").length;
|
||||
var li = node.getElementsByTagName("li").length-100;
|
||||
var input = node.getElementsByTagName("input").length;
|
||||
|
||||
var embedCount = 0;
|
||||
var embeds = tagsList[i].getElementsByTagName("embed");
|
||||
var embeds = node.getElementsByTagName("embed");
|
||||
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
|
||||
if (!this.REGEXPS.videos.test(embeds[ei].src))
|
||||
embedCount += 1;
|
||||
}
|
||||
|
||||
var linkDensity = this._getLinkDensity(tagsList[i]);
|
||||
var contentLength = this._getInnerText(tagsList[i]).length;
|
||||
var toRemove = false;
|
||||
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
|
||||
toRemove = true;
|
||||
} else if (!isList && li > p) {
|
||||
toRemove = true;
|
||||
} else if (input > Math.floor(p/3)) {
|
||||
toRemove = true;
|
||||
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
|
||||
toRemove = true;
|
||||
} else if (!isList && weight < 25 && linkDensity > 0.2) {
|
||||
toRemove = true;
|
||||
} else if (weight >= 25 && linkDensity > 0.5) {
|
||||
toRemove = true;
|
||||
} else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
|
||||
toRemove = true;
|
||||
}
|
||||
var linkDensity = this._getLinkDensity(node);
|
||||
var contentLength = this._getInnerText(node).length;
|
||||
|
||||
if (toRemove) {
|
||||
tagsList[i].parentNode.removeChild(tagsList[i]);
|
||||
}
|
||||
var haveToRemove =
|
||||
// Make an exception for elements with no p's and exactly 1 img.
|
||||
(img > p && !this._hasAncestorTag(node, "figure")) ||
|
||||
(!isList && li > p) ||
|
||||
(input > Math.floor(p/3)) ||
|
||||
(!isList && contentLength < 25 && (img === 0 || img > 2)) ||
|
||||
(!isList && weight < 25 && linkDensity > 0.2) ||
|
||||
(weight >= 25 && linkDensity > 0.5) ||
|
||||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
|
||||
return haveToRemove;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -1701,11 +1703,9 @@ Readability.prototype = {
|
|||
**/
|
||||
_cleanHeaders: function(e) {
|
||||
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
||||
var headers = e.getElementsByTagName('h' + headerIndex);
|
||||
for (var i = headers.length - 1; i >= 0; i -= 1) {
|
||||
if (this._getClassWeight(headers[i]) < 0)
|
||||
headers[i].parentNode.removeChild(headers[i]);
|
||||
}
|
||||
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
|
||||
return this._getClassWeight(header) < 0;
|
||||
});
|
||||
}
|
||||
},
|
||||
|
||||
|
@ -1729,6 +1729,22 @@ Readability.prototype = {
|
|||
isProbablyReaderable: function(helperIsVisible) {
|
||||
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
|
||||
|
||||
// Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
|
||||
// Some articles' DOM structures might look like
|
||||
// <div>
|
||||
// Sentences<br>
|
||||
// <br>
|
||||
// Sentences<br>
|
||||
// </div>
|
||||
var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
|
||||
if (brNodes.length) {
|
||||
var set = new Set();
|
||||
[].forEach.call(brNodes, function(node) {
|
||||
set.add(node.parentNode);
|
||||
});
|
||||
nodes = [].concat.apply(Array.from(set), nodes);
|
||||
}
|
||||
|
||||
// FIXME we should have a fallback for helperIsVisible, but this is
|
||||
// problematic because of jsdom's elem.style handling - see
|
||||
// https://github.com/mozilla/readability/pull/186 for context.
|
||||
|
@ -1832,12 +1848,16 @@ Readability.prototype = {
|
|||
}
|
||||
}
|
||||
|
||||
return { uri: this._uri,
|
||||
title: articleTitle,
|
||||
byline: metadata.byline || this._articleByline,
|
||||
dir: this._articleDir,
|
||||
content: articleContent.innerHTML,
|
||||
length: articleContent.textContent.length,
|
||||
excerpt: metadata.excerpt };
|
||||
var textContent = articleContent.textContent;
|
||||
return {
|
||||
uri: this._uri,
|
||||
title: articleTitle,
|
||||
byline: metadata.byline || this._articleByline,
|
||||
dir: this._articleDir,
|
||||
content: articleContent.innerHTML,
|
||||
textContent: textContent,
|
||||
length: textContent.length,
|
||||
excerpt: metadata.excerpt,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
|
Загрузка…
Ссылка в новой задаче