Bug 1323861, Bug 1322674, Bug 1217007 - Update readability from github repo, r=Gijs

MozReview-Commit-ID: K0VcAPMaqBV

--HG--
extra : rebase_source : a18e1a9093c59b6f42b9d04839ffa81d50f16b7c
This commit is contained in:
Evan Tseng 2017-02-23 16:35:06 +08:00
Родитель 0c91af7ef6
Коммит e71fa05721
2 изменённых файлов: 113 добавлений и 54 удалений

Просмотреть файл

@ -1017,46 +1017,6 @@
}
},
readScript: function (node) {
while (this.currentChar < this.html.length) {
var c = this.nextChar();
var nextC = this.peekNext();
if (c === "<") {
if (nextC === "!" || nextC === "?") {
// We're still before the ! or ? that is starting this comment:
this.currentChar++;
node.appendChild(this.discardNextComment());
continue;
}
if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
// Go back before the '<' so we find the end tag.
this.currentChar--;
// Done with this script tag, the caller will close:
return;
}
}
// Either c wasn't a '<' or it was but we couldn't find either a comment
// or a closing script tag, so we should just parse as text until the next one
// comes along:
var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
var textNode = haveTextNode ? node.lastChild : new Text();
var n = this.html.indexOf("<", this.currentChar);
// Decrement this to include the current character *afterwards* so we don't get stuck
// looking for the same < all the time.
this.currentChar--;
if (n === -1) {
textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length;
} else {
textNode.innerHTML += this.html.substring(this.currentChar, n);
this.currentChar = n;
}
if (!haveTextNode)
node.appendChild(textNode);
}
},
discardNextComment: function() {
if (this.match("--")) {
this.discardTo("-->");
@ -1131,11 +1091,7 @@
// If this isn't a void Element, read its child nodes
if (!closed) {
if (localName == "script") {
this.readScript(node);
} else {
this.readChildren(node);
}
this.readChildren(node);
var closingTag = "</" + localName + ">";
if (!this.match(closingTag)) {
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));

Просмотреть файл

@ -476,6 +476,11 @@ Readability.prototype = {
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);
// Check for data tables before we continue, to avoid removing items in
// those tables, which will often be isolated even though they're
// visually linked to other content-ful elements (text, images, etc.).
this._markDataTables(articleContent);
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
@ -723,11 +728,11 @@ Readability.prototype = {
}
}
// Remove empty DIV, SECTION, and HEADER nodes
// Remove DIV, SECTION, and HEADER nodes without any content (e.g. text, image, video, or iframe).
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
this._isEmptyElement(node)) {
this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
}
@ -1186,10 +1191,11 @@ Readability.prototype = {
});
},
_isEmptyElement: function(node) {
_isElementWithoutContent: function(node) {
return node.nodeType === Node.ELEMENT_NODE &&
node.children.length == 0 &&
node.textContent.trim().length == 0;
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
},
/**
@ -1715,16 +1721,17 @@ Readability.prototype = {
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
* @param Function filterFn a filter to invoke to determine whether this node 'counts'
* @return Boolean
*/
_hasAncestorTag: function(node, tagName, maxDepth) {
_hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
if (depth > maxDepth)
if (maxDepth > 0 && depth > maxDepth)
return false;
if (node.parentNode.tagName === tagName)
if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
return true;
node = node.parentNode;
depth++;
@ -1732,6 +1739,93 @@ Readability.prototype = {
return false;
},
/**
* Return an object indicating how many rows and columns this table has.
*/
_getRowAndColumnCount: function(table) {
var rows = 0;
var columns = 0;
var trs = table.getElementsByTagName("tr");
for (var i = 0; i < trs.length; i++) {
var rowspan = trs[i].getAttribute("rowspan") || 0;
if (rowspan) {
rowspan = parseInt(rowspan, 10);
}
rows += (rowspan || 1);
// Now look for column-related info
var columnsInThisRow = 0;
var cells = trs[i].getElementsByTagName("td");
for (var j = 0; j < cells.length; j++) {
var colspan = cells[j].getAttribute("colspan") || 0;
if (colspan) {
colspan = parseInt(colspan, 10);
}
columnsInThisRow += (colspan || 1);
}
columns = Math.max(columns, columnsInThisRow);
}
return {rows: rows, columns: columns};
},
/**
* Look for 'data' (as opposed to 'layout') tables, for which we use
* similar checks as
* https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
*/
_markDataTables: function(root) {
var tables = root.getElementsByTagName("table");
for (var i = 0; i < tables.length; i++) {
var table = tables[i];
var role = table.getAttribute("role");
if (role == "presentation") {
table._readabilityDataTable = false;
continue;
}
var datatable = table.getAttribute("datatable");
if (datatable == "0") {
table._readabilityDataTable = false;
continue;
}
var summary = table.getAttribute("summary");
if (summary) {
table._readabilityDataTable = true;
continue;
}
var caption = table.getElementsByTagName("caption")[0];
if (caption && caption.childNodes.length > 0) {
table._readabilityDataTable = true;
continue;
}
// If the table has a descendant with any of these tags, consider a data table:
var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
var descendantExists = function(tag) {
return !!table.getElementsByTagName(tag)[0];
};
if (dataTableDescendants.some(descendantExists)) {
this.log("Data table because found data-y descendant");
table._readabilityDataTable = true;
continue;
}
// Nested tables indicate a layout table:
if (table.getElementsByTagName("table")[0]) {
table._readabilityDataTable = false;
continue;
}
var sizeInfo = this._getRowAndColumnCount(table);
if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
table._readabilityDataTable = true;
continue;
}
// Now just go by size entirely:
table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
}
},
/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@ -1750,6 +1844,15 @@ Readability.prototype = {
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) {
// First check if we're in a data table, in which case don't remove us.
var isDataTable = function(t) {
return t._readabilityDataTable;
};
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
return false;
}
var weight = this._getClassWeight(node);
var contentScore = 0;
@ -1765,7 +1868,7 @@ Readability.prototype = {
// ominous signs, remove the element.
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
var li = node.getElementsByTagName("li").length-100;
var li = node.getElementsByTagName("li").length - 100;
var input = node.getElementsByTagName("input").length;
var embedCount = 0;