No bug, update Readability and JSDOMParser to github tip, rs=me

--HG--
extra : rebase_source : dbfc50e5de39ea8ac7190d48af10121a91d77411
This commit is contained in:
Gijs Kruitbosch 2015-04-02 21:48:31 +01:00
Родитель b0e85afe73
Коммит 352a7c0220
2 изменённых файлов: 83 добавлений и 24 удалений

Просмотреть файл

@ -615,6 +615,7 @@
this.childNodes[i].parentNode = null;
}
this.childNodes = node.childNodes;
this.children = node.children;
for (var i = this.childNodes.length; --i >= 0;) {
this.childNodes[i].parentNode = this;
}
@ -628,6 +629,7 @@
var node = new Text();
this.childNodes = [ node ];
this.children = [];
node.textContent = text;
node.parentNode = this;
},
@ -924,14 +926,59 @@
},
// Reads the contents of a <script> element into `node`.
// NOTE(review): this span comes from a diff hunk ("@ -924,14 +926,59 @") whose
// +/- markers are not visible; it appears to interleave the removed
// indexOf("</script>")-based body with the added character-scanning loop, and
// the braces below do not balance as shown. Confirm against the real file
// before relying on the exact statement order.
readScript: function (node) {
var index = this.html.indexOf("</script>", this.currentChar);
if (index === -1) {
index = this.html.length;
// Scan forward until the input is exhausted or a closing script tag is found.
while (this.currentChar < this.html.length) {
var c = this.nextChar();
var nextC = this.peekNext();
if (c === "<") {
if (nextC === "!" || nextC === "?") {
// We're still before the ! or ? that is starting this comment:
this.currentChar++;
// NOTE(review): discardNextComment() returns null when the input ends
// before a closing '>' is seen; that null is passed straight to
// appendChild here -- confirm appendChild tolerates a null child.
node.appendChild(this.discardNextComment());
continue;
}
// The end-tag comparison lower-cases the next 8 characters, so the
// closing tag is matched regardless of letter case.
if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
// Go back before the '<' so we find the end tag.
this.currentChar--;
// Done with this script tag, the caller will close:
return;
}
}
// Either c wasn't a '<' or it was but we couldn't find either a comment
// or a closing script tag, so we should just parse as text until the next one
// comes along:
// Reuse the trailing text node when the last child already is one, so
// consecutive text runs accumulate into a single node.
var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
var textNode = haveTextNode ? node.lastChild : new Text();
// The search starts at currentChar, i.e. after the character just
// consumed, so a '<' that was consumed above is not found again.
var n = this.html.indexOf("<", this.currentChar);
// Decrement this to include the current character *afterwards* so we don't get stuck
// looking for the same < all the time.
this.currentChar--;
if (n === -1) {
// No further '<': everything up to the end of the input is text.
textNode.textContent += this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length;
} else {
// Take the text up to (not including) the next '<' and resume there.
textNode.textContent += this.html.substring(this.currentChar, n);
this.currentChar = n;
}
if (!haveTextNode)
node.appendChild(textNode);
}
var txt = new Text();
txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
node.appendChild(txt);
this.currentChar = index;
},
discardNextComment: function() {
if (this.match("--")) {
this.discardTo("-->");
} else {
var c = this.nextChar();
while (c !== ">") {
if (c === undefined)
return null;
if (c === '"' || c === "'")
this.readString(c);
c = this.nextChar();
}
}
return new Comment();
},
@ -969,20 +1016,9 @@
// them away in readChildren()). So just returning an empty Comment node
// here is sufficient.
if (c === "!" || c === "?") {
// We're still before the ! or ? that is starting this comment:
this.currentChar++;
if (this.match("--")) {
this.discardTo("-->");
} else {
var c = this.nextChar();
while (c !== ">") {
if (c === undefined)
return null;
if (c === '"' || c === "'")
this.readString(c);
c = this.nextChar();
}
}
return new Comment();
return this.discardNextComment();
}
// If we're reading a closing tag, return null. This means we've reached

Просмотреть файл

@ -95,7 +95,7 @@ Readability.prototype = {
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
@ -1448,6 +1448,29 @@ Readability.prototype = {
});
},
/**
* Check if a given node has one of its ancestor tag name matching the
* provided one.
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
* @return Boolean
*/
_hasAncestorTag: function(node, tagName, maxDepth) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
if (depth > maxDepth)
return false;
if (node.parentNode.tagName === tagName)
return true;
node = node.parentNode;
depth++;
}
return false;
},
/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@ -1493,8 +1516,7 @@ Readability.prototype = {
var linkDensity = this._getLinkDensity(tagsList[i]);
var contentLength = this._getInnerText(tagsList[i]).length;
var toRemove = false;
if (img > p) {
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
toRemove = true;
} else if (li > p && tag !== "ul" && tag !== "ol") {
toRemove = true;
@ -1510,8 +1532,9 @@ Readability.prototype = {
toRemove = true;
}
if (toRemove)
if (toRemove) {
tagsList[i].parentNode.removeChild(tagsList[i]);
}
}
}
},