Bug 1467742 - update readability from github rev 8fec62d246f563090f5711c02907e2bea7ce96f8, r=jaws

MozReview-Commit-ID: JPdLWPyToCb

--HG--
extra : rebase_source : 701ba8ee87bddbbcb4acb169615600162cc32d1b
This commit is contained in:
Gijs Kruitbosch 2018-06-08 11:57:03 +01:00
Родитель a20855c9f5
Коммит 69f3da3ddd
2 изменённых файлов: 162 добавлений и 82 удалений

Просмотреть файл

@ -618,6 +618,13 @@
};
var Element = function (tag) {
// We use this to find the closing tag.
this._matchingTag = tag;
// We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
var lastColonIndex = tag.lastIndexOf(":");
if (lastColonIndex != -1) {
tag = tag.substring(lastColonIndex + 1);
}
this.attributes = [];
this.childNodes = [];
this.children = [];
@ -785,7 +792,13 @@
break;
}
}
}
},
hasAttribute: function (name) {
return this.attributes.some(function (attr) {
return attr.name == name;
});
},
};
var Style = function (node) {
@ -1062,9 +1075,10 @@
return null;
// Read any text as Text node
var textNode;
if (c !== "<") {
--this.currentChar;
var textNode = new Text();
textNode = new Text();
var n = this.html.indexOf("<", this.currentChar);
if (n === -1) {
textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
@ -1076,6 +1090,18 @@
return textNode;
}
if (this.match("![CDATA[")) {
var endChar = this.html.indexOf("]]>", this.currentChar);
if (endChar === -1) {
this.error("unclosed CDATA section");
return null;
}
textNode = new Text();
textNode.textContent = this.html.substring(this.currentChar, endChar);
this.currentChar = endChar + ("]]>").length;
return textNode;
}
c = this.peekNext();
// Read Comment node. Normally, Comment nodes know their inner
@ -1107,7 +1133,7 @@
// If this isn't a void Element, read its child nodes
if (!closed) {
this.readChildren(node);
var closingTag = "</" + localName + ">";
var closingTag = "</" + node._matchingTag + ">";
if (!this.match(closingTag)) {
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
return null;

Просмотреть файл

@ -29,14 +29,19 @@
/**
* Public constructor.
* @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
function Readability(uri, doc, options) {
function Readability(doc, options) {
// In some older versions, people passed a URI as the first argument. Cope:
if (options && options.documentElement) {
doc = options;
options = arguments[2];
} else if (!doc || !doc.documentElement) {
throw new Error("First argument to Readability constructor should be a document object.");
}
options = options || {};
this._uri = uri;
this._doc = doc;
this._articleTitle = null;
this._articleByline = null;
@ -47,7 +52,7 @@ function Readability(uri, doc, options) {
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
@ -93,6 +98,10 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
// https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
ELEMENT_NODE: 1,
TEXT_NODE: 3,
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
@ -103,8 +112,8 @@ Readability.prototype = {
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
// The default number of words an article must have in order to return a result
DEFAULT_WORD_THRESHOLD: 500,
// The default number of chars an article must have in order to return a result
DEFAULT_CHAR_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
@ -132,8 +141,19 @@ Readability.prototype = {
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
// The commented out elements qualify as phrasing content but tend to be
// removed by readability when put into paragraphs, so we ignore them here.
PHRASING_ELEMS: [
// "CANVAS", "IFRAME", "SVG", "VIDEO",
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
"DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
"MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
"RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
"SUP", "TEXTAREA", "TIME", "VAR", "WBR"
],
// These are the classes that readability sets itself.
CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
CLASSES_TO_PRESERVE: [ "page" ],
/**
* Run any post-process modifications to article content as necessary.
@ -215,6 +235,21 @@ Readability.prototype = {
return Array.prototype.some.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, return true if all of the provided iterate
* function calls return true, false otherwise.
*
* For convenience, the current object context is applied to the
* provided iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @return Boolean
*/
_everyNode: function(nodeList, fn) {
return Array.prototype.every.call(nodeList, fn, this);
},
/**
* Concat all nodelists passed as arguments.
*
@ -327,7 +362,7 @@ Readability.prototype = {
var origTitle = "";
try {
curTitle = origTitle = doc.title;
curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
@ -355,8 +390,9 @@ Readability.prototype = {
doc.getElementsByTagName('h1'),
doc.getElementsByTagName('h2')
);
var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
return heading.textContent === curTitle;
return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
@ -421,7 +457,7 @@ Readability.prototype = {
_nextElement: function (node) {
var next = node;
while (next
&& (next.nodeType != Node.ELEMENT_NODE)
&& (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
@ -464,16 +500,22 @@ Readability.prototype = {
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
var nextElem = this._nextElement(next);
var nextElem = this._nextElement(next.nextSibling);
if (nextElem && nextElem.tagName == "BR")
break;
}
if (!this._isPhrasingContent(next)) break;
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
}
});
},
@ -523,6 +565,7 @@ Readability.prototype = {
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
this._clean(articleContent, "aside");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
@ -579,6 +622,19 @@ Readability.prototype = {
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
});
// Remove single-cell tables
this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
if (this._hasSingleTagInsideElement(tbody, "TR")) {
var row = tbody.firstElementChild;
if (this._hasSingleTagInsideElement(row, "TD")) {
var cell = row.firstElementChild;
cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
table.parentNode.replaceChild(cell, table);
}
}
});
},
/**
@ -658,37 +714,6 @@ Readability.prototype = {
return node && node.nextElementSibling;
},
/**
* Like _getNextNode, but for DOM implementations with no
* firstElementChild/nextElementSibling functionality...
*/
_getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
function nextSiblingEl(n) {
do {
n = n.nextSibling;
} while (n && n.nodeType !== n.ELEMENT_NODE);
return n;
}
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.children[0]) {
return node.children[0];
}
// Then for siblings...
var next = nextSiblingEl(node);
if (next) {
return next;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
if (node)
next = nextSiblingEl(node);
} while (node && !next);
return node && next;
},
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
@ -751,6 +776,12 @@ Readability.prototype = {
while (node) {
var matchString = node.className + " " + node.id;
if (!this._isProbablyVisible(node)) {
this.log("Removing hidden node - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
// Check to see if this node is a byline, and remove it if it is.
if (this._checkByline(node, matchString)) {
node = this._removeAndGetNext(node);
@ -784,11 +815,31 @@ Readability.prototype = {
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
// Put phrasing content into paragraphs.
var p = null;
var childNode = node.firstChild;
while (childNode) {
var nextSibling = childNode.nextSibling;
if (this._isPhrasingContent(childNode)) {
if (p !== null) {
p.appendChild(childNode);
} else if (!this._isWhitespace(childNode)) {
p = doc.createElement('p');
node.replaceChild(p, childNode);
p.appendChild(childNode);
}
} else if (p !== null) {
while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
p = null;
}
childNode = nextSibling;
}
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if (this._hasSinglePInsideElement(node)) {
if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
@ -796,17 +847,6 @@ Readability.prototype = {
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
p.className = 'readability-styled';
node.replaceChild(p, childNode);
}
});
}
}
node = this._getNextNode(node);
@ -846,7 +886,7 @@ Readability.prototype = {
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
if (!ancestor.tagName)
if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
return;
if (typeof(ancestor.readability) === 'undefined') {
@ -1085,7 +1125,7 @@ Readability.prototype = {
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
var textLength = this._getInnerText(articleContent, true).length;
if (textLength < this._wordThreshold) {
if (textLength < this._charThreshold) {
parseSuccessful = false;
page.innerHTML = pageCacheHtml;
@ -1233,27 +1273,28 @@ Readability.prototype = {
},
/**
* Check if this node has only whitespace and a single P element
* Check if this node has only whitespace and a single element with given tag
* Returns false if the DIV node contains non-empty text nodes
* or if it contains no P or more than 1 element.
* or if it contains no element with given tag or more than 1 element.
*
* @param Element
* @param string tag of child element
**/
_hasSinglePInsideElement: function(element) {
// There should be exactly 1 element child which is a P:
if (element.children.length != 1 || element.children[0].tagName !== "P") {
_hasSingleTagInsideElement: function(element, tag) {
// There should be exactly 1 element child with given tag
if (element.children.length != 1 || element.children[0].tagName !== tag) {
return false;
}
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
return node.nodeType === Node.TEXT_NODE &&
return node.nodeType === this.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
_isElementWithoutContent: function(node) {
return node.nodeType === Node.ELEMENT_NODE &&
return node.nodeType === this.ELEMENT_NODE &&
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
@ -1271,6 +1312,21 @@ Readability.prototype = {
});
},
/***
* Determine if a node qualifies as phrasing content.
* https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
**/
_isPhrasingContent: function(node) {
return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
this._everyNode(node.childNodes, this._isPhrasingContent));
},
_isWhitespace: function(node) {
return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
(node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
},
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
@ -1312,16 +1368,14 @@ Readability.prototype = {
if (!e || e.tagName.toLowerCase() === 'svg')
return;
if (e.className !== 'readability-styled') {
// Remove `style` and deprecated presentational attributes
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
}
// Remove `style` and deprecated presentational attributes
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
}
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
e.removeAttribute('width');
e.removeAttribute('height');
}
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
e.removeAttribute('width');
e.removeAttribute('height');
}
var cur = e.firstElementChild;
@ -1639,6 +1693,10 @@ Readability.prototype = {
this._flags = this._flags & ~flag;
},
_isProbablyVisible: function(node) {
return node.style.display != "none" && !node.hasAttribute("hidden");
},
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
@ -1663,9 +1721,9 @@ Readability.prototype = {
nodes = [].concat.apply(Array.from(set), nodes);
}
// FIXME we should have a fallback for helperIsVisible, but this is
// problematic because of jsdom's elem.style handling - see
// https://github.com/mozilla/readability/pull/186 for context.
if (!helperIsVisible) {
helperIsVisible = this._isProbablyVisible;
}
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
@ -1719,9 +1777,6 @@ Readability.prototype = {
}
}
if (typeof this._doc.documentElement.firstElementChild === "undefined") {
this._getNextNode = this._getNextNodeNoElementProperties;
}
// Remove script tags from the document.
this._removeScripts(this._doc);
@ -1750,7 +1805,6 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
uri: this._uri,
title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,