зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1323861, Bug 1322674, Bug 1217007 - Update readability from github repo, r=Gijs
MozReview-Commit-ID: K0VcAPMaqBV --HG-- extra : rebase_source : a18e1a9093c59b6f42b9d04839ffa81d50f16b7c
This commit is contained in:
Родитель
0c91af7ef6
Коммит
e71fa05721
|
@ -1017,46 +1017,6 @@
|
|||
}
|
||||
},
|
||||
|
||||
readScript: function (node) {
|
||||
while (this.currentChar < this.html.length) {
|
||||
var c = this.nextChar();
|
||||
var nextC = this.peekNext();
|
||||
if (c === "<") {
|
||||
if (nextC === "!" || nextC === "?") {
|
||||
// We're still before the ! or ? that is starting this comment:
|
||||
this.currentChar++;
|
||||
node.appendChild(this.discardNextComment());
|
||||
continue;
|
||||
}
|
||||
if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
|
||||
// Go back before the '<' so we find the end tag.
|
||||
this.currentChar--;
|
||||
// Done with this script tag, the caller will close:
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Either c wasn't a '<' or it was but we couldn't find either a comment
|
||||
// or a closing script tag, so we should just parse as text until the next one
|
||||
// comes along:
|
||||
|
||||
var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
|
||||
var textNode = haveTextNode ? node.lastChild : new Text();
|
||||
var n = this.html.indexOf("<", this.currentChar);
|
||||
// Decrement this to include the current character *afterwards* so we don't get stuck
|
||||
// looking for the same < all the time.
|
||||
this.currentChar--;
|
||||
if (n === -1) {
|
||||
textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
|
||||
this.currentChar = this.html.length;
|
||||
} else {
|
||||
textNode.innerHTML += this.html.substring(this.currentChar, n);
|
||||
this.currentChar = n;
|
||||
}
|
||||
if (!haveTextNode)
|
||||
node.appendChild(textNode);
|
||||
}
|
||||
},
|
||||
|
||||
discardNextComment: function() {
|
||||
if (this.match("--")) {
|
||||
this.discardTo("-->");
|
||||
|
@ -1131,11 +1091,7 @@
|
|||
|
||||
// If this isn't a void Element, read its child nodes
|
||||
if (!closed) {
|
||||
if (localName == "script") {
|
||||
this.readScript(node);
|
||||
} else {
|
||||
this.readChildren(node);
|
||||
}
|
||||
this.readChildren(node);
|
||||
var closingTag = "</" + localName + ">";
|
||||
if (!this.match(closingTag)) {
|
||||
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
|
||||
|
|
|
@ -476,6 +476,11 @@ Readability.prototype = {
|
|||
_prepArticle: function(articleContent) {
|
||||
this._cleanStyles(articleContent);
|
||||
|
||||
// Check for data tables before we continue, to avoid removing items in
|
||||
// those tables, which will often be isolated even though they're
|
||||
// visually linked to other content-ful elements (text, images, etc.).
|
||||
this._markDataTables(articleContent);
|
||||
|
||||
// Clean out junk from the article content
|
||||
this._cleanConditionally(articleContent, "form");
|
||||
this._cleanConditionally(articleContent, "fieldset");
|
||||
|
@ -723,11 +728,11 @@ Readability.prototype = {
|
|||
}
|
||||
}
|
||||
|
||||
// Remove empty DIV, SECTION, and HEADER nodes
|
||||
// Remove DIV, SECTION, and HEADER nodes without any content (e.g. text, image, video, or iframe).
|
||||
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
|
||||
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
|
||||
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
|
||||
this._isEmptyElement(node)) {
|
||||
this._isElementWithoutContent(node)) {
|
||||
node = this._removeAndGetNext(node);
|
||||
continue;
|
||||
}
|
||||
|
@ -1186,10 +1191,11 @@ Readability.prototype = {
|
|||
});
|
||||
},
|
||||
|
||||
_isEmptyElement: function(node) {
|
||||
_isElementWithoutContent: function(node) {
|
||||
return node.nodeType === Node.ELEMENT_NODE &&
|
||||
node.children.length == 0 &&
|
||||
node.textContent.trim().length == 0;
|
||||
node.textContent.trim().length == 0 &&
|
||||
(node.children.length == 0 ||
|
||||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -1715,16 +1721,17 @@ Readability.prototype = {
|
|||
* @param HTMLElement node
|
||||
* @param String tagName
|
||||
* @param Number maxDepth
|
||||
* @param Function filterFn a filter to invoke to determine whether this node 'counts'
|
||||
* @return Boolean
|
||||
*/
|
||||
_hasAncestorTag: function(node, tagName, maxDepth) {
|
||||
_hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
|
||||
maxDepth = maxDepth || 3;
|
||||
tagName = tagName.toUpperCase();
|
||||
var depth = 0;
|
||||
while (node.parentNode) {
|
||||
if (depth > maxDepth)
|
||||
if (maxDepth > 0 && depth > maxDepth)
|
||||
return false;
|
||||
if (node.parentNode.tagName === tagName)
|
||||
if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
|
||||
return true;
|
||||
node = node.parentNode;
|
||||
depth++;
|
||||
|
@ -1732,6 +1739,93 @@ Readability.prototype = {
|
|||
return false;
|
||||
},
|
||||
|
||||
/**
|
||||
* Return an object indicating how many rows and columns this table has.
|
||||
*/
|
||||
_getRowAndColumnCount: function(table) {
|
||||
var rows = 0;
|
||||
var columns = 0;
|
||||
var trs = table.getElementsByTagName("tr");
|
||||
for (var i = 0; i < trs.length; i++) {
|
||||
var rowspan = trs[i].getAttribute("rowspan") || 0;
|
||||
if (rowspan) {
|
||||
rowspan = parseInt(rowspan, 10);
|
||||
}
|
||||
rows += (rowspan || 1);
|
||||
|
||||
// Now look for column-related info
|
||||
var columnsInThisRow = 0;
|
||||
var cells = trs[i].getElementsByTagName("td");
|
||||
for (var j = 0; j < cells.length; j++) {
|
||||
var colspan = cells[j].getAttribute("colspan") || 0;
|
||||
if (colspan) {
|
||||
colspan = parseInt(colspan, 10);
|
||||
}
|
||||
columnsInThisRow += (colspan || 1);
|
||||
}
|
||||
columns = Math.max(columns, columnsInThisRow);
|
||||
}
|
||||
return {rows: rows, columns: columns};
|
||||
},
|
||||
|
||||
/**
|
||||
* Look for 'data' (as opposed to 'layout') tables, for which we use
|
||||
* similar checks as
|
||||
* https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
|
||||
*/
|
||||
_markDataTables: function(root) {
|
||||
var tables = root.getElementsByTagName("table");
|
||||
for (var i = 0; i < tables.length; i++) {
|
||||
var table = tables[i];
|
||||
var role = table.getAttribute("role");
|
||||
if (role == "presentation") {
|
||||
table._readabilityDataTable = false;
|
||||
continue;
|
||||
}
|
||||
var datatable = table.getAttribute("datatable");
|
||||
if (datatable == "0") {
|
||||
table._readabilityDataTable = false;
|
||||
continue;
|
||||
}
|
||||
var summary = table.getAttribute("summary");
|
||||
if (summary) {
|
||||
table._readabilityDataTable = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
var caption = table.getElementsByTagName("caption")[0];
|
||||
if (caption && caption.childNodes.length > 0) {
|
||||
table._readabilityDataTable = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the table has a descendant with any of these tags, consider a data table:
|
||||
var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
|
||||
var descendantExists = function(tag) {
|
||||
return !!table.getElementsByTagName(tag)[0];
|
||||
};
|
||||
if (dataTableDescendants.some(descendantExists)) {
|
||||
this.log("Data table because found data-y descendant");
|
||||
table._readabilityDataTable = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Nested tables indicate a layout table:
|
||||
if (table.getElementsByTagName("table")[0]) {
|
||||
table._readabilityDataTable = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
var sizeInfo = this._getRowAndColumnCount(table);
|
||||
if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
|
||||
table._readabilityDataTable = true;
|
||||
continue;
|
||||
}
|
||||
// Now just go by size entirely:
|
||||
table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean an element of all tags of type "tag" if they look fishy.
|
||||
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||
|
@ -1750,6 +1844,15 @@ Readability.prototype = {
|
|||
//
|
||||
// TODO: Consider taking into account original contentScore here.
|
||||
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
||||
// First check if we're in a data table, in which case don't remove us.
|
||||
var isDataTable = function(t) {
|
||||
return t._readabilityDataTable;
|
||||
};
|
||||
|
||||
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var weight = this._getClassWeight(node);
|
||||
var contentScore = 0;
|
||||
|
||||
|
@ -1765,7 +1868,7 @@ Readability.prototype = {
|
|||
// ominous signs, remove the element.
|
||||
var p = node.getElementsByTagName("p").length;
|
||||
var img = node.getElementsByTagName("img").length;
|
||||
var li = node.getElementsByTagName("li").length-100;
|
||||
var li = node.getElementsByTagName("li").length - 100;
|
||||
var input = node.getElementsByTagName("input").length;
|
||||
|
||||
var embedCount = 0;
|
||||
|
|
Загрузка…
Ссылка в новой задаче