зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1467742 - update readability from github rev 8fec62d246f563090f5711c02907e2bea7ce96f8, r=jaws
MozReview-Commit-ID: JPdLWPyToCb --HG-- extra : rebase_source : 701ba8ee87bddbbcb4acb169615600162cc32d1b
This commit is contained in:
Родитель
a20855c9f5
Коммит
69f3da3ddd
|
@ -618,6 +618,13 @@
|
|||
};
|
||||
|
||||
var Element = function (tag) {
|
||||
// We use this to find the closing tag.
|
||||
this._matchingTag = tag;
|
||||
// We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
|
||||
var lastColonIndex = tag.lastIndexOf(":");
|
||||
if (lastColonIndex != -1) {
|
||||
tag = tag.substring(lastColonIndex + 1);
|
||||
}
|
||||
this.attributes = [];
|
||||
this.childNodes = [];
|
||||
this.children = [];
|
||||
|
@ -785,7 +792,13 @@
|
|||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
hasAttribute: function (name) {
|
||||
return this.attributes.some(function (attr) {
|
||||
return attr.name == name;
|
||||
});
|
||||
},
|
||||
};
|
||||
|
||||
var Style = function (node) {
|
||||
|
@ -1062,9 +1075,10 @@
|
|||
return null;
|
||||
|
||||
// Read any text as Text node
|
||||
var textNode;
|
||||
if (c !== "<") {
|
||||
--this.currentChar;
|
||||
var textNode = new Text();
|
||||
textNode = new Text();
|
||||
var n = this.html.indexOf("<", this.currentChar);
|
||||
if (n === -1) {
|
||||
textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
|
||||
|
@ -1076,6 +1090,18 @@
|
|||
return textNode;
|
||||
}
|
||||
|
||||
if (this.match("![CDATA[")) {
|
||||
var endChar = this.html.indexOf("]]>", this.currentChar);
|
||||
if (endChar === -1) {
|
||||
this.error("unclosed CDATA section");
|
||||
return null;
|
||||
}
|
||||
textNode = new Text();
|
||||
textNode.textContent = this.html.substring(this.currentChar, endChar);
|
||||
this.currentChar = endChar + ("]]>").length;
|
||||
return textNode;
|
||||
}
|
||||
|
||||
c = this.peekNext();
|
||||
|
||||
// Read Comment node. Normally, Comment nodes know their inner
|
||||
|
@ -1107,7 +1133,7 @@
|
|||
// If this isn't a void Element, read its child nodes
|
||||
if (!closed) {
|
||||
this.readChildren(node);
|
||||
var closingTag = "</" + localName + ">";
|
||||
var closingTag = "</" + node._matchingTag + ">";
|
||||
if (!this.match(closingTag)) {
|
||||
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
|
||||
return null;
|
||||
|
|
|
@ -29,14 +29,19 @@
|
|||
|
||||
/**
|
||||
* Public constructor.
|
||||
* @param {Object} uri The URI descriptor object.
|
||||
* @param {HTMLDocument} doc The document to parse.
|
||||
* @param {Object} options The options object.
|
||||
*/
|
||||
function Readability(uri, doc, options) {
|
||||
function Readability(doc, options) {
|
||||
// In some older versions, people passed a URI as the first argument. Cope:
|
||||
if (options && options.documentElement) {
|
||||
doc = options;
|
||||
options = arguments[2];
|
||||
} else if (!doc || !doc.documentElement) {
|
||||
throw new Error("First argument to Readability constructor should be a document object.");
|
||||
}
|
||||
options = options || {};
|
||||
|
||||
this._uri = uri;
|
||||
this._doc = doc;
|
||||
this._articleTitle = null;
|
||||
this._articleByline = null;
|
||||
|
@ -47,7 +52,7 @@ function Readability(uri, doc, options) {
|
|||
this._debug = !!options.debug;
|
||||
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
|
||||
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
|
||||
this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
|
||||
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
|
||||
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
|
||||
|
||||
// Start with all flags set
|
||||
|
@ -93,6 +98,10 @@ Readability.prototype = {
|
|||
FLAG_WEIGHT_CLASSES: 0x2,
|
||||
FLAG_CLEAN_CONDITIONALLY: 0x4,
|
||||
|
||||
// https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
|
||||
ELEMENT_NODE: 1,
|
||||
TEXT_NODE: 3,
|
||||
|
||||
// Max number of nodes supported by this parser. Default: 0 (no limit)
|
||||
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
|
||||
|
||||
|
@ -103,8 +112,8 @@ Readability.prototype = {
|
|||
// Element tags to score by default.
|
||||
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
|
||||
|
||||
// The default number of words an article must have in order to return a result
|
||||
DEFAULT_WORD_THRESHOLD: 500,
|
||||
// The default number of chars an article must have in order to return a result
|
||||
DEFAULT_CHAR_THRESHOLD: 500,
|
||||
|
||||
// All of the regular expressions in use within readability.
|
||||
// Defined up here so we don't instantiate them repeatedly in loops.
|
||||
|
@ -132,8 +141,19 @@ Readability.prototype = {
|
|||
|
||||
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
|
||||
|
||||
// The commented out elements qualify as phrasing content but tend to be
|
||||
// removed by readability when put into paragraphs, so we ignore them here.
|
||||
PHRASING_ELEMS: [
|
||||
// "CANVAS", "IFRAME", "SVG", "VIDEO",
|
||||
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
|
||||
"DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
|
||||
"MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
|
||||
"RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
|
||||
"SUP", "TEXTAREA", "TIME", "VAR", "WBR"
|
||||
],
|
||||
|
||||
// These are the classes that readability sets itself.
|
||||
CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
|
||||
CLASSES_TO_PRESERVE: [ "page" ],
|
||||
|
||||
/**
|
||||
* Run any post-process modifications to article content as necessary.
|
||||
|
@ -215,6 +235,21 @@ Readability.prototype = {
|
|||
return Array.prototype.some.call(nodeList, fn, this);
|
||||
},
|
||||
|
||||
/**
|
||||
* Iterate over a NodeList, return true if all of the provided iterate
|
||||
* function calls return true, false otherwise.
|
||||
*
|
||||
* For convenience, the current object context is applied to the
|
||||
* provided iterate function.
|
||||
*
|
||||
* @param NodeList nodeList The NodeList.
|
||||
* @param Function fn The iterate function.
|
||||
* @return Boolean
|
||||
*/
|
||||
_everyNode: function(nodeList, fn) {
|
||||
return Array.prototype.every.call(nodeList, fn, this);
|
||||
},
|
||||
|
||||
/**
|
||||
* Concat all nodelists passed as arguments.
|
||||
*
|
||||
|
@ -327,7 +362,7 @@ Readability.prototype = {
|
|||
var origTitle = "";
|
||||
|
||||
try {
|
||||
curTitle = origTitle = doc.title;
|
||||
curTitle = origTitle = doc.title.trim();
|
||||
|
||||
// If they had an element with id "title" in their HTML
|
||||
if (typeof curTitle !== "string")
|
||||
|
@ -355,8 +390,9 @@ Readability.prototype = {
|
|||
doc.getElementsByTagName('h1'),
|
||||
doc.getElementsByTagName('h2')
|
||||
);
|
||||
var trimmedTitle = curTitle.trim();
|
||||
var match = this._someNode(headings, function(heading) {
|
||||
return heading.textContent === curTitle;
|
||||
return heading.textContent.trim() === trimmedTitle;
|
||||
});
|
||||
|
||||
// If we don't, let's extract the title out of the original title string.
|
||||
|
@ -421,7 +457,7 @@ Readability.prototype = {
|
|||
_nextElement: function (node) {
|
||||
var next = node;
|
||||
while (next
|
||||
&& (next.nodeType != Node.ELEMENT_NODE)
|
||||
&& (next.nodeType != this.ELEMENT_NODE)
|
||||
&& this.REGEXPS.whitespace.test(next.textContent)) {
|
||||
next = next.nextSibling;
|
||||
}
|
||||
|
@ -464,16 +500,22 @@ Readability.prototype = {
|
|||
while (next) {
|
||||
// If we've hit another <br><br>, we're done adding children to this <p>.
|
||||
if (next.tagName == "BR") {
|
||||
var nextElem = this._nextElement(next);
|
||||
var nextElem = this._nextElement(next.nextSibling);
|
||||
if (nextElem && nextElem.tagName == "BR")
|
||||
break;
|
||||
}
|
||||
|
||||
if (!this._isPhrasingContent(next)) break;
|
||||
|
||||
// Otherwise, make this node a child of the new <p>.
|
||||
var sibling = next.nextSibling;
|
||||
p.appendChild(next);
|
||||
next = sibling;
|
||||
}
|
||||
|
||||
while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
|
||||
|
||||
if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
|
||||
}
|
||||
});
|
||||
},
|
||||
|
@ -523,6 +565,7 @@ Readability.prototype = {
|
|||
this._clean(articleContent, "h1");
|
||||
this._clean(articleContent, "footer");
|
||||
this._clean(articleContent, "link");
|
||||
this._clean(articleContent, "aside");
|
||||
|
||||
// Clean out elements have "share" in their id/class combinations from final top candidates,
|
||||
// which means we don't remove the top candidates even they have "share".
|
||||
|
@ -579,6 +622,19 @@ Readability.prototype = {
|
|||
if (next && next.tagName == "P")
|
||||
br.parentNode.removeChild(br);
|
||||
});
|
||||
|
||||
// Remove single-cell tables
|
||||
this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
|
||||
var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
|
||||
if (this._hasSingleTagInsideElement(tbody, "TR")) {
|
||||
var row = tbody.firstElementChild;
|
||||
if (this._hasSingleTagInsideElement(row, "TD")) {
|
||||
var cell = row.firstElementChild;
|
||||
cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
|
||||
table.parentNode.replaceChild(cell, table);
|
||||
}
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -658,37 +714,6 @@ Readability.prototype = {
|
|||
return node && node.nextElementSibling;
|
||||
},
|
||||
|
||||
/**
|
||||
* Like _getNextNode, but for DOM implementations with no
|
||||
* firstElementChild/nextElementSibling functionality...
|
||||
*/
|
||||
_getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
|
||||
function nextSiblingEl(n) {
|
||||
do {
|
||||
n = n.nextSibling;
|
||||
} while (n && n.nodeType !== n.ELEMENT_NODE);
|
||||
return n;
|
||||
}
|
||||
// First check for kids if those aren't being ignored
|
||||
if (!ignoreSelfAndKids && node.children[0]) {
|
||||
return node.children[0];
|
||||
}
|
||||
// Then for siblings...
|
||||
var next = nextSiblingEl(node);
|
||||
if (next) {
|
||||
return next;
|
||||
}
|
||||
// And finally, move up the parent chain *and* find a sibling
|
||||
// (because this is depth-first traversal, we will have already
|
||||
// seen the parent nodes themselves).
|
||||
do {
|
||||
node = node.parentNode;
|
||||
if (node)
|
||||
next = nextSiblingEl(node);
|
||||
} while (node && !next);
|
||||
return node && next;
|
||||
},
|
||||
|
||||
_checkByline: function(node, matchString) {
|
||||
if (this._articleByline) {
|
||||
return false;
|
||||
|
@ -751,6 +776,12 @@ Readability.prototype = {
|
|||
while (node) {
|
||||
var matchString = node.className + " " + node.id;
|
||||
|
||||
if (!this._isProbablyVisible(node)) {
|
||||
this.log("Removing hidden node - " + matchString);
|
||||
node = this._removeAndGetNext(node);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check to see if this node is a byline, and remove it if it is.
|
||||
if (this._checkByline(node, matchString)) {
|
||||
node = this._removeAndGetNext(node);
|
||||
|
@ -784,11 +815,31 @@ Readability.prototype = {
|
|||
|
||||
// Turn all divs that don't have children block level elements into p's
|
||||
if (node.tagName === "DIV") {
|
||||
// Put phrasing content into paragraphs.
|
||||
var p = null;
|
||||
var childNode = node.firstChild;
|
||||
while (childNode) {
|
||||
var nextSibling = childNode.nextSibling;
|
||||
if (this._isPhrasingContent(childNode)) {
|
||||
if (p !== null) {
|
||||
p.appendChild(childNode);
|
||||
} else if (!this._isWhitespace(childNode)) {
|
||||
p = doc.createElement('p');
|
||||
node.replaceChild(p, childNode);
|
||||
p.appendChild(childNode);
|
||||
}
|
||||
} else if (p !== null) {
|
||||
while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
|
||||
p = null;
|
||||
}
|
||||
childNode = nextSibling;
|
||||
}
|
||||
|
||||
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
|
||||
// element. DIVs with only a P element inside and no text content can be
|
||||
// safely converted into plain P elements to avoid confusing the scoring
|
||||
// algorithm with DIVs with are, in practice, paragraphs.
|
||||
if (this._hasSinglePInsideElement(node)) {
|
||||
if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
|
||||
var newNode = node.children[0];
|
||||
node.parentNode.replaceChild(newNode, node);
|
||||
node = newNode;
|
||||
|
@ -796,17 +847,6 @@ Readability.prototype = {
|
|||
} else if (!this._hasChildBlockElement(node)) {
|
||||
node = this._setNodeTag(node, "P");
|
||||
elementsToScore.push(node);
|
||||
} else {
|
||||
// EXPERIMENTAL
|
||||
this._forEachNode(node.childNodes, function(childNode) {
|
||||
if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
|
||||
var p = doc.createElement('p');
|
||||
p.textContent = childNode.textContent;
|
||||
p.style.display = 'inline';
|
||||
p.className = 'readability-styled';
|
||||
node.replaceChild(p, childNode);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
node = this._getNextNode(node);
|
||||
|
@ -846,7 +886,7 @@ Readability.prototype = {
|
|||
|
||||
// Initialize and score ancestors.
|
||||
this._forEachNode(ancestors, function(ancestor, level) {
|
||||
if (!ancestor.tagName)
|
||||
if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
|
||||
return;
|
||||
|
||||
if (typeof(ancestor.readability) === 'undefined') {
|
||||
|
@ -1085,7 +1125,7 @@ Readability.prototype = {
|
|||
// finding the content, and the sieve approach gives us a higher likelihood of
|
||||
// finding the -right- content.
|
||||
var textLength = this._getInnerText(articleContent, true).length;
|
||||
if (textLength < this._wordThreshold) {
|
||||
if (textLength < this._charThreshold) {
|
||||
parseSuccessful = false;
|
||||
page.innerHTML = pageCacheHtml;
|
||||
|
||||
|
@ -1233,27 +1273,28 @@ Readability.prototype = {
|
|||
},
|
||||
|
||||
/**
|
||||
* Check if this node has only whitespace and a single P element
|
||||
* Check if this node has only whitespace and a single element with given tag
|
||||
* Returns false if the DIV node contains non-empty text nodes
|
||||
* or if it contains no P or more than 1 element.
|
||||
* or if it contains no element with given tag or more than 1 element.
|
||||
*
|
||||
* @param Element
|
||||
* @param string tag of child element
|
||||
**/
|
||||
_hasSinglePInsideElement: function(element) {
|
||||
// There should be exactly 1 element child which is a P:
|
||||
if (element.children.length != 1 || element.children[0].tagName !== "P") {
|
||||
_hasSingleTagInsideElement: function(element, tag) {
|
||||
// There should be exactly 1 element child with given tag
|
||||
if (element.children.length != 1 || element.children[0].tagName !== tag) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// And there should be no text nodes with real content
|
||||
return !this._someNode(element.childNodes, function(node) {
|
||||
return node.nodeType === Node.TEXT_NODE &&
|
||||
return node.nodeType === this.TEXT_NODE &&
|
||||
this.REGEXPS.hasContent.test(node.textContent);
|
||||
});
|
||||
},
|
||||
|
||||
_isElementWithoutContent: function(node) {
|
||||
return node.nodeType === Node.ELEMENT_NODE &&
|
||||
return node.nodeType === this.ELEMENT_NODE &&
|
||||
node.textContent.trim().length == 0 &&
|
||||
(node.children.length == 0 ||
|
||||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
|
||||
|
@ -1271,6 +1312,21 @@ Readability.prototype = {
|
|||
});
|
||||
},
|
||||
|
||||
/***
|
||||
* Determine if a node qualifies as phrasing content.
|
||||
* https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
|
||||
**/
|
||||
_isPhrasingContent: function(node) {
|
||||
return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
|
||||
((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
|
||||
this._everyNode(node.childNodes, this._isPhrasingContent));
|
||||
},
|
||||
|
||||
_isWhitespace: function(node) {
|
||||
return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
|
||||
(node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
|
||||
},
|
||||
|
||||
/**
|
||||
* Get the inner text of a node - cross browser compatibly.
|
||||
* This also strips out any excess whitespace to be found.
|
||||
|
@ -1312,16 +1368,14 @@ Readability.prototype = {
|
|||
if (!e || e.tagName.toLowerCase() === 'svg')
|
||||
return;
|
||||
|
||||
if (e.className !== 'readability-styled') {
|
||||
// Remove `style` and deprecated presentational attributes
|
||||
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
|
||||
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
|
||||
}
|
||||
// Remove `style` and deprecated presentational attributes
|
||||
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
|
||||
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
|
||||
}
|
||||
|
||||
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
|
||||
e.removeAttribute('width');
|
||||
e.removeAttribute('height');
|
||||
}
|
||||
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
|
||||
e.removeAttribute('width');
|
||||
e.removeAttribute('height');
|
||||
}
|
||||
|
||||
var cur = e.firstElementChild;
|
||||
|
@ -1639,6 +1693,10 @@ Readability.prototype = {
|
|||
this._flags = this._flags & ~flag;
|
||||
},
|
||||
|
||||
_isProbablyVisible: function(node) {
|
||||
return node.style.display != "none" && !node.hasAttribute("hidden");
|
||||
},
|
||||
|
||||
/**
|
||||
* Decides whether or not the document is reader-able without parsing the whole thing.
|
||||
*
|
||||
|
@ -1663,9 +1721,9 @@ Readability.prototype = {
|
|||
nodes = [].concat.apply(Array.from(set), nodes);
|
||||
}
|
||||
|
||||
// FIXME we should have a fallback for helperIsVisible, but this is
|
||||
// problematic because of jsdom's elem.style handling - see
|
||||
// https://github.com/mozilla/readability/pull/186 for context.
|
||||
if (!helperIsVisible) {
|
||||
helperIsVisible = this._isProbablyVisible;
|
||||
}
|
||||
|
||||
var score = 0;
|
||||
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
|
||||
|
@ -1719,9 +1777,6 @@ Readability.prototype = {
|
|||
}
|
||||
}
|
||||
|
||||
if (typeof this._doc.documentElement.firstElementChild === "undefined") {
|
||||
this._getNextNode = this._getNextNodeNoElementProperties;
|
||||
}
|
||||
// Remove script tags from the document.
|
||||
this._removeScripts(this._doc);
|
||||
|
||||
|
@ -1750,7 +1805,6 @@ Readability.prototype = {
|
|||
|
||||
var textContent = articleContent.textContent;
|
||||
return {
|
||||
uri: this._uri,
|
||||
title: this._articleTitle,
|
||||
byline: metadata.byline || this._articleByline,
|
||||
dir: this._articleDir,
|
||||
|
|
Загрузка…
Ссылка в новой задаче