зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1323861, Bug 1322674, Bug 1217007 - Update readability from github repo, r=Gijs
MozReview-Commit-ID: K0VcAPMaqBV --HG-- extra : rebase_source : a18e1a9093c59b6f42b9d04839ffa81d50f16b7c
This commit is contained in:
Родитель
0c91af7ef6
Коммит
e71fa05721
|
@ -1017,46 +1017,6 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
readScript: function (node) {
|
|
||||||
while (this.currentChar < this.html.length) {
|
|
||||||
var c = this.nextChar();
|
|
||||||
var nextC = this.peekNext();
|
|
||||||
if (c === "<") {
|
|
||||||
if (nextC === "!" || nextC === "?") {
|
|
||||||
// We're still before the ! or ? that is starting this comment:
|
|
||||||
this.currentChar++;
|
|
||||||
node.appendChild(this.discardNextComment());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
|
|
||||||
// Go back before the '<' so we find the end tag.
|
|
||||||
this.currentChar--;
|
|
||||||
// Done with this script tag, the caller will close:
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Either c wasn't a '<' or it was but we couldn't find either a comment
|
|
||||||
// or a closing script tag, so we should just parse as text until the next one
|
|
||||||
// comes along:
|
|
||||||
|
|
||||||
var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
|
|
||||||
var textNode = haveTextNode ? node.lastChild : new Text();
|
|
||||||
var n = this.html.indexOf("<", this.currentChar);
|
|
||||||
// Decrement this to include the current character *afterwards* so we don't get stuck
|
|
||||||
// looking for the same < all the time.
|
|
||||||
this.currentChar--;
|
|
||||||
if (n === -1) {
|
|
||||||
textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
|
|
||||||
this.currentChar = this.html.length;
|
|
||||||
} else {
|
|
||||||
textNode.innerHTML += this.html.substring(this.currentChar, n);
|
|
||||||
this.currentChar = n;
|
|
||||||
}
|
|
||||||
if (!haveTextNode)
|
|
||||||
node.appendChild(textNode);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
discardNextComment: function() {
|
discardNextComment: function() {
|
||||||
if (this.match("--")) {
|
if (this.match("--")) {
|
||||||
this.discardTo("-->");
|
this.discardTo("-->");
|
||||||
|
@ -1131,11 +1091,7 @@
|
||||||
|
|
||||||
// If this isn't a void Element, read its child nodes
|
// If this isn't a void Element, read its child nodes
|
||||||
if (!closed) {
|
if (!closed) {
|
||||||
if (localName == "script") {
|
this.readChildren(node);
|
||||||
this.readScript(node);
|
|
||||||
} else {
|
|
||||||
this.readChildren(node);
|
|
||||||
}
|
|
||||||
var closingTag = "</" + localName + ">";
|
var closingTag = "</" + localName + ">";
|
||||||
if (!this.match(closingTag)) {
|
if (!this.match(closingTag)) {
|
||||||
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
|
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
|
||||||
|
|
|
@ -476,6 +476,11 @@ Readability.prototype = {
|
||||||
_prepArticle: function(articleContent) {
|
_prepArticle: function(articleContent) {
|
||||||
this._cleanStyles(articleContent);
|
this._cleanStyles(articleContent);
|
||||||
|
|
||||||
|
// Check for data tables before we continue, to avoid removing items in
|
||||||
|
// those tables, which will often be isolated even though they're
|
||||||
|
// visually linked to other content-ful elements (text, images, etc.).
|
||||||
|
this._markDataTables(articleContent);
|
||||||
|
|
||||||
// Clean out junk from the article content
|
// Clean out junk from the article content
|
||||||
this._cleanConditionally(articleContent, "form");
|
this._cleanConditionally(articleContent, "form");
|
||||||
this._cleanConditionally(articleContent, "fieldset");
|
this._cleanConditionally(articleContent, "fieldset");
|
||||||
|
@ -723,11 +728,11 @@ Readability.prototype = {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove empty DIV, SECTION, and HEADER nodes
|
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
|
||||||
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
|
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
|
||||||
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
|
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
|
||||||
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
|
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
|
||||||
this._isEmptyElement(node)) {
|
this._isElementWithoutContent(node)) {
|
||||||
node = this._removeAndGetNext(node);
|
node = this._removeAndGetNext(node);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1186,10 +1191,11 @@ Readability.prototype = {
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
_isEmptyElement: function(node) {
|
_isElementWithoutContent: function(node) {
|
||||||
return node.nodeType === Node.ELEMENT_NODE &&
|
return node.nodeType === Node.ELEMENT_NODE &&
|
||||||
node.children.length == 0 &&
|
node.textContent.trim().length == 0 &&
|
||||||
node.textContent.trim().length == 0;
|
(node.children.length == 0 ||
|
||||||
|
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1715,16 +1721,17 @@ Readability.prototype = {
|
||||||
* @param HTMLElement node
|
* @param HTMLElement node
|
||||||
* @param String tagName
|
* @param String tagName
|
||||||
* @param Number maxDepth
|
* @param Number maxDepth
|
||||||
|
* @param Function filterFn a filter to invoke to determine whether this node 'counts'
|
||||||
* @return Boolean
|
* @return Boolean
|
||||||
*/
|
*/
|
||||||
_hasAncestorTag: function(node, tagName, maxDepth) {
|
_hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
|
||||||
maxDepth = maxDepth || 3;
|
maxDepth = maxDepth || 3;
|
||||||
tagName = tagName.toUpperCase();
|
tagName = tagName.toUpperCase();
|
||||||
var depth = 0;
|
var depth = 0;
|
||||||
while (node.parentNode) {
|
while (node.parentNode) {
|
||||||
if (depth > maxDepth)
|
if (maxDepth > 0 && depth > maxDepth)
|
||||||
return false;
|
return false;
|
||||||
if (node.parentNode.tagName === tagName)
|
if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
|
||||||
return true;
|
return true;
|
||||||
node = node.parentNode;
|
node = node.parentNode;
|
||||||
depth++;
|
depth++;
|
||||||
|
@ -1732,6 +1739,93 @@ Readability.prototype = {
|
||||||
return false;
|
return false;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return an object indicating how many rows and columns this table has.
|
||||||
|
*/
|
||||||
|
_getRowAndColumnCount: function(table) {
|
||||||
|
var rows = 0;
|
||||||
|
var columns = 0;
|
||||||
|
var trs = table.getElementsByTagName("tr");
|
||||||
|
for (var i = 0; i < trs.length; i++) {
|
||||||
|
var rowspan = trs[i].getAttribute("rowspan") || 0;
|
||||||
|
if (rowspan) {
|
||||||
|
rowspan = parseInt(rowspan, 10);
|
||||||
|
}
|
||||||
|
rows += (rowspan || 1);
|
||||||
|
|
||||||
|
// Now look for column-related info
|
||||||
|
var columnsInThisRow = 0;
|
||||||
|
var cells = trs[i].getElementsByTagName("td");
|
||||||
|
for (var j = 0; j < cells.length; j++) {
|
||||||
|
var colspan = cells[j].getAttribute("colspan") || 0;
|
||||||
|
if (colspan) {
|
||||||
|
colspan = parseInt(colspan, 10);
|
||||||
|
}
|
||||||
|
columnsInThisRow += (colspan || 1);
|
||||||
|
}
|
||||||
|
columns = Math.max(columns, columnsInThisRow);
|
||||||
|
}
|
||||||
|
return {rows: rows, columns: columns};
|
||||||
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Look for 'data' (as opposed to 'layout') tables, for which we use
|
||||||
|
* similar checks as
|
||||||
|
* https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
|
||||||
|
*/
|
||||||
|
_markDataTables: function(root) {
|
||||||
|
var tables = root.getElementsByTagName("table");
|
||||||
|
for (var i = 0; i < tables.length; i++) {
|
||||||
|
var table = tables[i];
|
||||||
|
var role = table.getAttribute("role");
|
||||||
|
if (role == "presentation") {
|
||||||
|
table._readabilityDataTable = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
var datatable = table.getAttribute("datatable");
|
||||||
|
if (datatable == "0") {
|
||||||
|
table._readabilityDataTable = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
var summary = table.getAttribute("summary");
|
||||||
|
if (summary) {
|
||||||
|
table._readabilityDataTable = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
var caption = table.getElementsByTagName("caption")[0];
|
||||||
|
if (caption && caption.childNodes.length > 0) {
|
||||||
|
table._readabilityDataTable = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the table has a descendant with any of these tags, consider a data table:
|
||||||
|
var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
|
||||||
|
var descendantExists = function(tag) {
|
||||||
|
return !!table.getElementsByTagName(tag)[0];
|
||||||
|
};
|
||||||
|
if (dataTableDescendants.some(descendantExists)) {
|
||||||
|
this.log("Data table because found data-y descendant");
|
||||||
|
table._readabilityDataTable = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Nested tables indicate a layout table:
|
||||||
|
if (table.getElementsByTagName("table")[0]) {
|
||||||
|
table._readabilityDataTable = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
var sizeInfo = this._getRowAndColumnCount(table);
|
||||||
|
if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
|
||||||
|
table._readabilityDataTable = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Now just go by size entirely:
|
||||||
|
table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean an element of all tags of type "tag" if they look fishy.
|
* Clean an element of all tags of type "tag" if they look fishy.
|
||||||
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||||
|
@ -1750,6 +1844,15 @@ Readability.prototype = {
|
||||||
//
|
//
|
||||||
// TODO: Consider taking into account original contentScore here.
|
// TODO: Consider taking into account original contentScore here.
|
||||||
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
||||||
|
// First check if we're in a data table, in which case don't remove us.
|
||||||
|
var isDataTable = function(t) {
|
||||||
|
return t._readabilityDataTable;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
var weight = this._getClassWeight(node);
|
var weight = this._getClassWeight(node);
|
||||||
var contentScore = 0;
|
var contentScore = 0;
|
||||||
|
|
||||||
|
@ -1765,7 +1868,7 @@ Readability.prototype = {
|
||||||
// ominous signs, remove the element.
|
// ominous signs, remove the element.
|
||||||
var p = node.getElementsByTagName("p").length;
|
var p = node.getElementsByTagName("p").length;
|
||||||
var img = node.getElementsByTagName("img").length;
|
var img = node.getElementsByTagName("img").length;
|
||||||
var li = node.getElementsByTagName("li").length-100;
|
var li = node.getElementsByTagName("li").length - 100;
|
||||||
var input = node.getElementsByTagName("input").length;
|
var input = node.getElementsByTagName("input").length;
|
||||||
|
|
||||||
var embedCount = 0;
|
var embedCount = 0;
|
||||||
|
|
Загрузка…
Ссылка в новой задаче