зеркало из https://github.com/mozilla/gecko-dev.git
No bug, update Readability and JSDOMParser to github tip, rs=me
--HG-- extra : rebase_source : dbfc50e5de39ea8ac7190d48af10121a91d77411
This commit is contained in:
Родитель
b0e85afe73
Коммит
352a7c0220
|
@ -615,6 +615,7 @@
|
|||
this.childNodes[i].parentNode = null;
|
||||
}
|
||||
this.childNodes = node.childNodes;
|
||||
this.children = node.children;
|
||||
for (var i = this.childNodes.length; --i >= 0;) {
|
||||
this.childNodes[i].parentNode = this;
|
||||
}
|
||||
|
@ -628,6 +629,7 @@
|
|||
|
||||
var node = new Text();
|
||||
this.childNodes = [ node ];
|
||||
this.children = [];
|
||||
node.textContent = text;
|
||||
node.parentNode = this;
|
||||
},
|
||||
|
@ -924,14 +926,59 @@
|
|||
},
|
||||
|
||||
readScript: function (node) {
|
||||
var index = this.html.indexOf("</script>", this.currentChar);
|
||||
if (index === -1) {
|
||||
index = this.html.length;
|
||||
while (this.currentChar < this.html.length) {
|
||||
var c = this.nextChar();
|
||||
var nextC = this.peekNext();
|
||||
if (c === "<") {
|
||||
if (nextC === "!" || nextC === "?") {
|
||||
// We're still before the ! or ? that is starting this comment:
|
||||
this.currentChar++;
|
||||
node.appendChild(this.discardNextComment());
|
||||
continue;
|
||||
}
|
||||
if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
|
||||
// Go back before the '<' so we find the end tag.
|
||||
this.currentChar--;
|
||||
// Done with this script tag, the caller will close:
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Either c wasn't a '<' or it was but we couldn't find either a comment
|
||||
// or a closing script tag, so we should just parse as text until the next one
|
||||
// comes along:
|
||||
|
||||
var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
|
||||
var textNode = haveTextNode ? node.lastChild : new Text();
|
||||
var n = this.html.indexOf("<", this.currentChar);
|
||||
// Decrement this to include the current character *afterwards* so we don't get stuck
|
||||
// looking for the same < all the time.
|
||||
this.currentChar--;
|
||||
if (n === -1) {
|
||||
textNode.textContent += this.html.substring(this.currentChar, this.html.length);
|
||||
this.currentChar = this.html.length;
|
||||
} else {
|
||||
textNode.textContent += this.html.substring(this.currentChar, n);
|
||||
this.currentChar = n;
|
||||
}
|
||||
if (!haveTextNode)
|
||||
node.appendChild(textNode);
|
||||
}
|
||||
var txt = new Text();
|
||||
txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
|
||||
node.appendChild(txt);
|
||||
this.currentChar = index;
|
||||
},
|
||||
|
||||
/**
 * Skips over a comment-like construct in the input and returns an empty
 * Comment node standing in for it. The visible callers bump currentChar
 * once (past the "!" or "?" that follows "<") before calling this.
 *
 * @return Comment a new, empty Comment node, or null when nextChar()
 *         returns undefined before a closing ">" has been read.
 */
discardNextComment: function() {
|
||||
// "--" right here means a "<!-- ... -->" style comment; hand off to
// discardTo() with the "-->" terminator.
// NOTE(review): presumably match() also consumes the "--" - confirm.
if (this.match("--")) {
|
||||
this.discardTo("-->");
|
||||
} else {
|
||||
// Anything else: read character by character until the closing ">".
var c = this.nextChar();
|
||||
while (c !== ">") {
|
||||
// nextChar() gave nothing back (presumably end of input): give up and
// report that no node was produced.
if (c === undefined)
|
||||
return null;
|
||||
// On a quote character, let readString() deal with the quoted section.
// NOTE(review): presumably so that a ">" inside quotes does not end the
// scan early - confirm against readString().
if (c === '"' || c === "'")
|
||||
this.readString(c);
|
||||
c = this.nextChar();
|
||||
}
|
||||
}
|
||||
// The contents are thrown away; only an empty placeholder node is returned.
return new Comment();
|
||||
},
|
||||
|
||||
|
||||
|
@ -969,20 +1016,9 @@
|
|||
// them away in readChildren()). So just returning an empty Comment node
|
||||
// here is sufficient.
|
||||
if (c === "!" || c === "?") {
|
||||
// We're still before the ! or ? that is starting this comment:
|
||||
this.currentChar++;
|
||||
if (this.match("--")) {
|
||||
this.discardTo("-->");
|
||||
} else {
|
||||
var c = this.nextChar();
|
||||
while (c !== ">") {
|
||||
if (c === undefined)
|
||||
return null;
|
||||
if (c === '"' || c === "'")
|
||||
this.readString(c);
|
||||
c = this.nextChar();
|
||||
}
|
||||
}
|
||||
return new Comment();
|
||||
return this.discardNextComment();
|
||||
}
|
||||
|
||||
// If we're reading a closing tag, return null. This means we've reached
|
||||
|
|
|
@ -95,7 +95,7 @@ Readability.prototype = {
|
|||
// All of the regular expressions in use within readability.
|
||||
// Defined up here so we don't instantiate them repeatedly in loops.
|
||||
REGEXPS: {
|
||||
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
|
||||
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
|
||||
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
||||
negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
||||
|
@ -1448,6 +1448,29 @@ Readability.prototype = {
|
|||
});
|
||||
},
|
||||
|
||||
/**
|
||||
* Check whether one of the ancestors of a given node has a tag name
|
||||
* matching the provided one. The walk follows parentNode links and stops
* at the depth limit or when a node has no parentNode.
|
||||
* @param HTMLElement node the node whose ancestors are inspected
|
||||
* @param String tagName tag name to look for; upper-cased before comparing
|
||||
* @param Number maxDepth depth limit for the walk; falls back to 3 when
*        omitted or otherwise falsy (so 0 also becomes 3)
|
||||
* @return Boolean true if a matching ancestor was found within the limit
|
||||
*/
|
||||
_hasAncestorTag: function(node, tagName, maxDepth) {
|
||||
maxDepth = maxDepth || 3;
|
||||
// Compared with strict equality against parentNode.tagName below, so the
// requested name is normalised to upper case first.
tagName = tagName.toUpperCase();
|
||||
var depth = 0;
|
||||
while (node.parentNode) {
|
||||
// NOTE(review): depth starts at 0 and the test is ">", so up to
// maxDepth + 1 ancestors are examined before giving up - confirm that
// this is the intended limit.
if (depth > maxDepth)
|
||||
return false;
|
||||
if (node.parentNode.tagName === tagName)
|
||||
return true;
|
||||
node = node.parentNode;
|
||||
depth++;
|
||||
}
|
||||
// Ran out of ancestors without finding a match.
return false;
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean an element of all tags of type "tag" if they look fishy.
|
||||
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||
|
@ -1493,8 +1516,7 @@ Readability.prototype = {
|
|||
var linkDensity = this._getLinkDensity(tagsList[i]);
|
||||
var contentLength = this._getInnerText(tagsList[i]).length;
|
||||
var toRemove = false;
|
||||
|
||||
if (img > p) {
|
||||
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
|
||||
toRemove = true;
|
||||
} else if (li > p && tag !== "ul" && tag !== "ol") {
|
||||
toRemove = true;
|
||||
|
@ -1510,8 +1532,9 @@ Readability.prototype = {
|
|||
toRemove = true;
|
||||
}
|
||||
|
||||
if (toRemove)
|
||||
if (toRemove) {
|
||||
tagsList[i].parentNode.removeChild(tagsList[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
|
Загрузка…
Ссылка в новой задаче