зеркало из https://github.com/mozilla/gecko-dev.git
No bug: update readability libs to the up-to-date github versions to include significant perf and quality improvements, rs=me
--HG-- extra : rebase_source : 464cf93b5110cc456454bab0b698bc10d32bea49
This commit is contained in:
Родитель
e771ae69ec
Коммит
82c7c6de1e
|
@ -1,3 +1,10 @@
|
|||
/*
|
||||
* DO NOT MODIFY THIS FILE DIRECTLY!
|
||||
*
|
||||
* This is a shared library that is maintained in an external repo:
|
||||
* https://github.com/mozilla/readability
|
||||
*/
|
||||
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
@ -31,7 +38,7 @@
|
|||
}
|
||||
|
||||
// When a style is set in JS, map it to the corresponding CSS attribute
|
||||
let styleMap = {
|
||||
var styleMap = {
|
||||
"alignmentBaseline": "alignment-baseline",
|
||||
"background": "background",
|
||||
"backgroundAttachment": "background-attachment",
|
||||
|
@ -223,7 +230,7 @@
|
|||
};
|
||||
|
||||
// Elements that can be self-closing
|
||||
let voidElems = {
|
||||
var voidElems = {
|
||||
"area": true,
|
||||
"base": true,
|
||||
"br": true,
|
||||
|
@ -239,8 +246,10 @@
|
|||
"source": true,
|
||||
};
|
||||
|
||||
var whitespace = [" ", "\t", "\n", "\r"];
|
||||
|
||||
// See http://www.w3schools.com/dom/dom_nodetype.asp
|
||||
let nodeTypes = {
|
||||
var nodeTypes = {
|
||||
ELEMENT_NODE: 1,
|
||||
ATTRIBUTE_NODE: 2,
|
||||
TEXT_NODE: 3,
|
||||
|
@ -257,14 +266,12 @@
|
|||
|
||||
function getElementsByTagName(tag) {
|
||||
tag = tag.toUpperCase();
|
||||
let elems = [];
|
||||
let allTags = (tag === "*");
|
||||
var elems = [];
|
||||
var allTags = (tag === "*");
|
||||
function getElems(node) {
|
||||
let length = node.childNodes.length;
|
||||
for (let i = 0; i < length; i++) {
|
||||
let child = node.childNodes[i];
|
||||
if (child.nodeType !== 1)
|
||||
continue;
|
||||
var length = node.children.length;
|
||||
for (var i = 0; i < length; i++) {
|
||||
var child = node.children[i];
|
||||
if (allTags || (child.tagName === tag))
|
||||
elems.push(child);
|
||||
getElems(child);
|
||||
|
@ -274,7 +281,7 @@
|
|||
return elems;
|
||||
}
|
||||
|
||||
let Node = function () {};
|
||||
var Node = function () {};
|
||||
|
||||
Node.prototype = {
|
||||
attributes: null,
|
||||
|
@ -283,18 +290,23 @@
|
|||
nodeName: null,
|
||||
parentNode: null,
|
||||
textContent: null,
|
||||
nextSibling: null,
|
||||
previousSibling: null,
|
||||
|
||||
get firstChild() {
|
||||
return this.childNodes[0] || null;
|
||||
},
|
||||
|
||||
get nextSibling() {
|
||||
if (this.parentNode) {
|
||||
let childNodes = this.parentNode.childNodes;
|
||||
return childNodes[childNodes.indexOf(this) + 1] || null;
|
||||
}
|
||||
get firstElementChild() {
|
||||
return this.children[0] || null;
|
||||
},
|
||||
|
||||
return null;
|
||||
get lastChild() {
|
||||
return this.childNodes[this.childNodes.length - 1] || null;
|
||||
},
|
||||
|
||||
get lastElementChild() {
|
||||
return this.children[this.children.length - 1] || null;
|
||||
},
|
||||
|
||||
appendChild: function (child) {
|
||||
|
@ -302,48 +314,152 @@
|
|||
child.parentNode.removeChild(child);
|
||||
}
|
||||
|
||||
var last = this.lastChild;
|
||||
if (last)
|
||||
last.nextSibling = child;
|
||||
child.previousSibling = last;
|
||||
|
||||
if (child.nodeType === Node.ELEMENT_NODE) {
|
||||
child.previousElementSibling = this.children[this.children.length - 1] || null;
|
||||
this.children.push(child);
|
||||
child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
|
||||
}
|
||||
this.childNodes.push(child);
|
||||
child.parentNode = this;
|
||||
},
|
||||
|
||||
removeChild: function (child) {
|
||||
let childNodes = this.childNodes;
|
||||
let childIndex = childNodes.indexOf(child);
|
||||
var childNodes = this.childNodes;
|
||||
var childIndex = childNodes.indexOf(child);
|
||||
if (childIndex === -1) {
|
||||
throw "removeChild: node not found";
|
||||
} else {
|
||||
child.parentNode = null;
|
||||
var prev = child.previousSibling;
|
||||
var next = child.nextSibling;
|
||||
if (prev)
|
||||
prev.nextSibling = next;
|
||||
if (next)
|
||||
next.previousSibling = prev;
|
||||
|
||||
if (child.nodeType === Node.ELEMENT_NODE) {
|
||||
prev = child.previousElementSibling;
|
||||
next = child.nextElementSibling;
|
||||
if (prev)
|
||||
prev.nextElementSibling = next;
|
||||
if (next)
|
||||
next.previousElementSibling = prev;
|
||||
this.children.splice(this.children.indexOf(child), 1);
|
||||
}
|
||||
|
||||
child.previousSibling = child.nextSibling = null;
|
||||
child.previousElementSibling = child.nextElementSibling = null;
|
||||
|
||||
return childNodes.splice(childIndex, 1)[0];
|
||||
}
|
||||
},
|
||||
|
||||
replaceChild: function (newNode, oldNode) {
|
||||
let childNodes = this.childNodes;
|
||||
let childIndex = childNodes.indexOf(oldNode);
|
||||
var childNodes = this.childNodes;
|
||||
var childIndex = childNodes.indexOf(oldNode);
|
||||
if (childIndex === -1) {
|
||||
throw "replaceChild: node not found";
|
||||
} else {
|
||||
// This will take care of updating the new node if it was somewhere else before:
|
||||
if (newNode.parentNode)
|
||||
newNode.parentNode.removeChild(newNode);
|
||||
|
||||
childNodes[childIndex] = newNode;
|
||||
|
||||
// update the new node's sibling properties, and its new siblings' sibling properties
|
||||
newNode.nextSibling = oldNode.nextSibling;
|
||||
newNode.previousSibling = oldNode.previousSibling;
|
||||
if (newNode.nextSibling)
|
||||
newNode.nextSibling.previousSibling = newNode;
|
||||
if (newNode.previousSibling)
|
||||
newNode.previousSibling.nextSibling = newNode;
|
||||
|
||||
newNode.parentNode = this;
|
||||
|
||||
// Now deal with elements before we clear out those values for the old node,
|
||||
// because it can help us take shortcuts here:
|
||||
if (newNode.nodeType === Node.ELEMENT_NODE) {
|
||||
if (oldNode.nodeType === Node.ELEMENT_NODE) {
|
||||
// Both were elements, which makes this easier, we just swap things out:
|
||||
newNode.previousElementSibling = oldNode.previousElementSibling;
|
||||
newNode.nextElementSibling = oldNode.nextElementSibling;
|
||||
if (newNode.previousElementSibling)
|
||||
newNode.previousElementSibling.nextElementSibling = newNode;
|
||||
if (newNode.nextElementSibling)
|
||||
newNode.nextElementSibling.previousElementSibling = newNode;
|
||||
this.children[this.children.indexOf(oldNode)] = newNode;
|
||||
} else {
|
||||
// Hard way:
|
||||
newNode.previousElementSibling = (function() {
|
||||
for (var i = childIndex - 1; i >= 0; i--) {
|
||||
if (childNodes[i].nodeType === Node.ELEMENT_NODE)
|
||||
return childNodes[i];
|
||||
}
|
||||
return null;
|
||||
})();
|
||||
if (newNode.previousElementSibling) {
|
||||
newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
|
||||
} else {
|
||||
newNode.nextElementSibling = (function() {
|
||||
for (var i = childIndex + 1; i < childNodes.length; i++) {
|
||||
if (childNodes[i].nodeType === Node.ELEMENT_NODE)
|
||||
return childNodes[i];
|
||||
}
|
||||
return null;
|
||||
})();
|
||||
}
|
||||
if (newNode.previousElementSibling)
|
||||
newNode.previousElementSibling.nextElementSibling = newNode;
|
||||
if (newNode.nextElementSibling)
|
||||
newNode.nextElementSibling.previousElementSibling = newNode;
|
||||
|
||||
if (newNode.nextElementSibling)
|
||||
this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
|
||||
else
|
||||
this.children.push(newNode);
|
||||
}
|
||||
} else {
|
||||
// new node is not an element node.
|
||||
// if the old one was, update its element siblings:
|
||||
if (oldNode.nodeType === Node.ELEMENT_NODE) {
|
||||
if (oldNode.previousElementSibling)
|
||||
oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
|
||||
if (oldNode.nextElementSibling)
|
||||
oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
|
||||
this.children.splice(this.children.indexOf(oldNode), 1);
|
||||
}
|
||||
// If the old node wasn't an element, neither the new nor the old node was an element,
|
||||
// and the children array and its members shouldn't need any updating.
|
||||
}
|
||||
|
||||
|
||||
oldNode.parentNode = null;
|
||||
oldNode.previousSibling = null;
|
||||
oldNode.nextSibling = null;
|
||||
if (oldNode.nodeType === Node.ELEMENT_NODE) {
|
||||
oldNode.previousElementSibling = null;
|
||||
oldNode.nextElementSibling = null;
|
||||
}
|
||||
return oldNode;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (let i in nodeTypes) {
|
||||
for (var i in nodeTypes) {
|
||||
Node[i] = Node.prototype[i] = nodeTypes[i];
|
||||
}
|
||||
|
||||
let Attribute = function (name, value) {
|
||||
var Attribute = function (name, value) {
|
||||
this.name = name;
|
||||
this.value = value;
|
||||
};
|
||||
|
||||
let Comment = function () {
|
||||
var Comment = function () {
|
||||
this.childNodes = [];
|
||||
};
|
||||
|
||||
|
@ -354,7 +470,7 @@
|
|||
nodeType: Node.COMMENT_NODE
|
||||
};
|
||||
|
||||
let Text = function () {
|
||||
var Text = function () {
|
||||
this.childNodes = [];
|
||||
};
|
||||
|
||||
|
@ -366,9 +482,10 @@
|
|||
textContent: ""
|
||||
}
|
||||
|
||||
let Document = function () {
|
||||
var Document = function () {
|
||||
this.styleSheets = [];
|
||||
this.childNodes = [];
|
||||
this.children = [];
|
||||
};
|
||||
|
||||
Document.prototype = {
|
||||
|
@ -382,11 +499,11 @@
|
|||
|
||||
getElementById: function (id) {
|
||||
function getElem(node) {
|
||||
let length = node.childNodes.length;
|
||||
var length = node.children.length;
|
||||
if (node.id === id)
|
||||
return node;
|
||||
for (let i = 0; i < length; i++) {
|
||||
let el = getElem(node.childNodes[i]);
|
||||
for (var i = 0; i < length; i++) {
|
||||
var el = getElem(node.children[i]);
|
||||
if (el)
|
||||
return el;
|
||||
}
|
||||
|
@ -396,14 +513,16 @@
|
|||
},
|
||||
|
||||
createElement: function (tag) {
|
||||
let node = new Element(tag);
|
||||
var node = new Element(tag);
|
||||
return node;
|
||||
}
|
||||
};
|
||||
|
||||
let Element = function (tag) {
|
||||
var Element = function (tag) {
|
||||
this.attributes = [];
|
||||
this.childNodes = [];
|
||||
this.children = [];
|
||||
this.nextElementSibling = this.previousElementSibling = null;
|
||||
this.localName = tag.toLowerCase();
|
||||
this.tagName = tag.toUpperCase();
|
||||
this.style = new Style(this);
|
||||
|
@ -454,16 +573,16 @@
|
|||
|
||||
get innerHTML() {
|
||||
function getHTML(node) {
|
||||
let i = 0;
|
||||
var i = 0;
|
||||
for (i = 0; i < node.childNodes.length; i++) {
|
||||
let child = node.childNodes[i];
|
||||
var child = node.childNodes[i];
|
||||
if (child.localName) {
|
||||
arr.push("<" + child.localName);
|
||||
|
||||
// serialize attribute list
|
||||
for (let j = 0; j < child.attributes.length; j++) {
|
||||
let attr = child.attributes[j];
|
||||
let quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
|
||||
for (var j = 0; j < child.attributes.length; j++) {
|
||||
var attr = child.attributes[j];
|
||||
var quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
|
||||
arr.push(" " + attr.name + '=' + quote + attr.value + quote);
|
||||
}
|
||||
|
||||
|
@ -484,30 +603,30 @@
|
|||
|
||||
// Using Array.join() avoids the overhead from lazy string concatenation.
|
||||
// See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
|
||||
let arr = [];
|
||||
var arr = [];
|
||||
getHTML(this);
|
||||
return arr.join("");
|
||||
},
|
||||
|
||||
set innerHTML(html) {
|
||||
let parser = new JSDOMParser();
|
||||
let node = parser.parse(html);
|
||||
for (let i = this.childNodes.length; --i >= 0;) {
|
||||
var parser = new JSDOMParser();
|
||||
var node = parser.parse(html);
|
||||
for (var i = this.childNodes.length; --i >= 0;) {
|
||||
this.childNodes[i].parentNode = null;
|
||||
}
|
||||
this.childNodes = node.childNodes;
|
||||
for (let i = this.childNodes.length; --i >= 0;) {
|
||||
for (var i = this.childNodes.length; --i >= 0;) {
|
||||
this.childNodes[i].parentNode = this;
|
||||
}
|
||||
},
|
||||
|
||||
set textContent(text) {
|
||||
// clear parentNodes for existing children
|
||||
for (let i = this.childNodes.length; --i >= 0;) {
|
||||
for (var i = this.childNodes.length; --i >= 0;) {
|
||||
this.childNodes[i].parentNode = null;
|
||||
}
|
||||
|
||||
let node = new Text();
|
||||
var node = new Text();
|
||||
this.childNodes = [ node ];
|
||||
node.textContent = text;
|
||||
node.parentNode = this;
|
||||
|
@ -515,9 +634,9 @@
|
|||
|
||||
get textContent() {
|
||||
function getText(node) {
|
||||
let nodes = node.childNodes;
|
||||
for (let i = 0; i < nodes.length; i++) {
|
||||
let child = nodes[i];
|
||||
var nodes = node.childNodes;
|
||||
for (var i = 0; i < nodes.length; i++) {
|
||||
var child = nodes[i];
|
||||
if (child.nodeType === 3) {
|
||||
text.push(child.textContent);
|
||||
} else {
|
||||
|
@ -528,14 +647,14 @@
|
|||
|
||||
// Using Array.join() avoids the overhead from lazy string concatenation.
|
||||
// See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
|
||||
let text = [];
|
||||
var text = [];
|
||||
getText(this);
|
||||
return text.join("");
|
||||
},
|
||||
|
||||
getAttribute: function (name) {
|
||||
for (let i = this.attributes.length; --i >= 0;) {
|
||||
let attr = this.attributes[i];
|
||||
for (var i = this.attributes.length; --i >= 0;) {
|
||||
var attr = this.attributes[i];
|
||||
if (attr.name === name)
|
||||
return attr.value;
|
||||
}
|
||||
|
@ -543,8 +662,8 @@
|
|||
},
|
||||
|
||||
setAttribute: function (name, value) {
|
||||
for (let i = this.attributes.length; --i >= 0;) {
|
||||
let attr = this.attributes[i];
|
||||
for (var i = this.attributes.length; --i >= 0;) {
|
||||
var attr = this.attributes[i];
|
||||
if (attr.name === name) {
|
||||
attr.value = value;
|
||||
return;
|
||||
|
@ -554,8 +673,8 @@
|
|||
},
|
||||
|
||||
removeAttribute: function (name) {
|
||||
for (let i = this.attributes.length; --i >= 0;) {
|
||||
let attr = this.attributes[i];
|
||||
for (var i = this.attributes.length; --i >= 0;) {
|
||||
var attr = this.attributes[i];
|
||||
if (attr.name === name) {
|
||||
this.attributes.splice(i, 1);
|
||||
break;
|
||||
|
@ -564,7 +683,7 @@
|
|||
}
|
||||
};
|
||||
|
||||
let Style = function (node) {
|
||||
var Style = function (node) {
|
||||
this.node = node;
|
||||
};
|
||||
|
||||
|
@ -575,14 +694,14 @@
|
|||
// manipulations, so this should be okay.
|
||||
Style.prototype = {
|
||||
getStyle: function (styleName) {
|
||||
let attr = this.node.getAttribute("style");
|
||||
var attr = this.node.getAttribute("style");
|
||||
if (!attr)
|
||||
return undefined;
|
||||
|
||||
let styles = attr.split(";");
|
||||
for (let i = 0; i < styles.length; i++) {
|
||||
let style = styles[i].split(":");
|
||||
let name = style[0].trim();
|
||||
var styles = attr.split(";");
|
||||
for (var i = 0; i < styles.length; i++) {
|
||||
var style = styles[i].split(":");
|
||||
var name = style[0].trim();
|
||||
if (name === styleName)
|
||||
return style[1].trim();
|
||||
}
|
||||
|
@ -591,12 +710,12 @@
|
|||
},
|
||||
|
||||
setStyle: function (styleName, styleValue) {
|
||||
let value = this.node.getAttribute("style") || "";
|
||||
let index = 0;
|
||||
var value = this.node.getAttribute("style") || "";
|
||||
var index = 0;
|
||||
do {
|
||||
let next = value.indexOf(";", index) + 1;
|
||||
let length = next - index - 1;
|
||||
let style = (length > 0 ? value.substr(index, length) : value.substr(index));
|
||||
var next = value.indexOf(";", index) + 1;
|
||||
var length = next - index - 1;
|
||||
var style = (length > 0 ? value.substr(index, length) : value.substr(index));
|
||||
if (style.substr(0, style.indexOf(":")).trim() === styleName) {
|
||||
value = value.substr(0, index).trim() + (next ? " " + value.substr(next).trim() : "");
|
||||
break;
|
||||
|
@ -611,7 +730,7 @@
|
|||
|
||||
// For each item in styleMap, define a getter and setter on the style
|
||||
// property.
|
||||
for (let jsName in styleMap) {
|
||||
for (var jsName in styleMap) {
|
||||
(function (cssName) {
|
||||
Style.prototype.__defineGetter__(jsName, function () {
|
||||
return this.getStyle(cssName);
|
||||
|
@ -622,7 +741,7 @@
|
|||
}) (styleMap[jsName]);
|
||||
}
|
||||
|
||||
let JSDOMParser = function () {
|
||||
var JSDOMParser = function () {
|
||||
this.currentChar = 0;
|
||||
|
||||
// In makeElementNode() we build up many strings one char at a time. Using
|
||||
|
@ -659,8 +778,8 @@
|
|||
* character and returns the text string in between.
|
||||
*/
|
||||
readString: function (quote) {
|
||||
let str;
|
||||
let n = this.html.indexOf(quote, this.currentChar);
|
||||
var str;
|
||||
var n = this.html.indexOf(quote, this.currentChar);
|
||||
if (n === -1) {
|
||||
this.currentChar = this.html.length;
|
||||
str = null;
|
||||
|
@ -677,9 +796,9 @@
|
|||
* pair and adds the result to the attributes list.
|
||||
*/
|
||||
readAttribute: function (node) {
|
||||
let name = "";
|
||||
var name = "";
|
||||
|
||||
let n = this.html.indexOf("=", this.currentChar);
|
||||
var n = this.html.indexOf("=", this.currentChar);
|
||||
if (n === -1) {
|
||||
this.currentChar = this.html.length;
|
||||
} else {
|
||||
|
@ -692,14 +811,14 @@
|
|||
return;
|
||||
|
||||
// After a '=', we should see a '"' for the attribute value
|
||||
let c = this.nextChar();
|
||||
var c = this.nextChar();
|
||||
if (c !== '"' && c !== "'") {
|
||||
error("expecting '\"'");
|
||||
error("Error reading attribute " + name + ", expecting '\"'");
|
||||
return;
|
||||
}
|
||||
|
||||
// Read the attribute value (and consume the matching quote)
|
||||
let value = this.readString(c);
|
||||
var value = this.readString(c);
|
||||
|
||||
if (!value)
|
||||
return;
|
||||
|
@ -718,29 +837,30 @@
|
|||
* Element
|
||||
*/
|
||||
makeElementNode: function (retPair) {
|
||||
let c = this.nextChar();
|
||||
var c = this.nextChar();
|
||||
|
||||
// Read the Element tag name
|
||||
let strBuf = this.strBuf;
|
||||
var strBuf = this.strBuf;
|
||||
strBuf.length = 0;
|
||||
while (c !== " " && c !== ">" && c !== "/") {
|
||||
while (whitespace.indexOf(c) == -1 && c !== ">" && c !== "/") {
|
||||
if (c === undefined)
|
||||
return false;
|
||||
strBuf.push(c);
|
||||
c = this.nextChar();
|
||||
}
|
||||
let tag = strBuf.join('');
|
||||
var tag = strBuf.join('');
|
||||
|
||||
if (!tag)
|
||||
return false;
|
||||
|
||||
let node = new Element(tag);
|
||||
var node = new Element(tag);
|
||||
|
||||
// Read Element attributes
|
||||
while (c !== "/" && c !== ">") {
|
||||
if (c === undefined)
|
||||
return false;
|
||||
while (this.match(" "));
|
||||
while (whitespace.indexOf(this.html[this.currentChar++]) != -1);
|
||||
this.currentChar--;
|
||||
c = this.nextChar();
|
||||
if (c !== "/" && c !== ">") {
|
||||
--this.currentChar;
|
||||
|
@ -749,12 +869,12 @@
|
|||
}
|
||||
|
||||
// If this is a self-closing tag, read '/>'
|
||||
let closed = tag in voidElems;
|
||||
var closed = tag in voidElems;
|
||||
if (c === "/") {
|
||||
closed = true;
|
||||
c = this.nextChar();
|
||||
if (c !== ">") {
|
||||
error("expected '>'");
|
||||
error("expected '>' to close " + tag);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -771,7 +891,7 @@
|
|||
* @returns whether input matched string
|
||||
*/
|
||||
match: function (str) {
|
||||
let strlen = str.length;
|
||||
var strlen = str.length;
|
||||
if (this.html.substr(this.currentChar, strlen) === str) {
|
||||
this.currentChar += strlen;
|
||||
return true;
|
||||
|
@ -784,7 +904,7 @@
|
|||
* and including the matched string.
|
||||
*/
|
||||
discardTo: function (str) {
|
||||
let index = this.html.indexOf(str, this.currentChar) + str.length;
|
||||
var index = this.html.indexOf(str, this.currentChar) + str.length;
|
||||
if (index === -1)
|
||||
this.currentChar = this.html.length;
|
||||
this.currentChar = index;
|
||||
|
@ -794,16 +914,27 @@
|
|||
* Reads child nodes for the given node.
|
||||
*/
|
||||
readChildren: function (node) {
|
||||
let child;
|
||||
var child;
|
||||
while ((child = this.readNode())) {
|
||||
// Don't keep Comment nodes
|
||||
if (child.nodeType !== 8) {
|
||||
node.childNodes.push(child);
|
||||
child.parentNode = node;
|
||||
node.appendChild(child);
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
readScript: function (node) {
|
||||
var index = this.html.indexOf("</script>", this.currentChar);
|
||||
if (index === -1) {
|
||||
index = this.html.length;
|
||||
}
|
||||
var txt = new Text();
|
||||
txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
|
||||
node.appendChild(txt);
|
||||
this.currentChar = index;
|
||||
},
|
||||
|
||||
|
||||
/**
|
||||
* Reads the next child node from the input. If we're reading a closing
|
||||
* tag, or if we've reached the end of input, return null.
|
||||
|
@ -811,7 +942,7 @@
|
|||
* @returns the node
|
||||
*/
|
||||
readNode: function () {
|
||||
let c = this.nextChar();
|
||||
var c = this.nextChar();
|
||||
|
||||
if (c === undefined)
|
||||
return null;
|
||||
|
@ -819,8 +950,8 @@
|
|||
// Read any text as Text node
|
||||
if (c !== "<") {
|
||||
--this.currentChar;
|
||||
let node = new Text();
|
||||
let n = this.html.indexOf("<", this.currentChar);
|
||||
var node = new Text();
|
||||
var n = this.html.indexOf("<", this.currentChar);
|
||||
if (n === -1) {
|
||||
node.textContent = this.html.substring(this.currentChar, this.html.length);
|
||||
this.currentChar = this.html.length;
|
||||
|
@ -842,7 +973,7 @@
|
|||
if (this.match("--")) {
|
||||
this.discardTo("-->");
|
||||
} else {
|
||||
let c = this.nextChar();
|
||||
var c = this.nextChar();
|
||||
while (c !== ">") {
|
||||
if (c === undefined)
|
||||
return null;
|
||||
|
@ -862,25 +993,32 @@
|
|||
}
|
||||
|
||||
// Otherwise, we're looking at an Element node
|
||||
let result = this.makeElementNode(this.retPair);
|
||||
var result = this.makeElementNode(this.retPair);
|
||||
if (!result)
|
||||
return null;
|
||||
|
||||
let node = this.retPair[0];
|
||||
let closed = this.retPair[1];
|
||||
let localName = node.localName;
|
||||
var node = this.retPair[0];
|
||||
var closed = this.retPair[1];
|
||||
var localName = node.localName;
|
||||
|
||||
// If this isn't a void Element, read its child nodes
|
||||
if (!closed) {
|
||||
this.readChildren(node);
|
||||
let closingTag = "</" + localName + ">";
|
||||
if (localName == "script") {
|
||||
this.readScript(node);
|
||||
} else {
|
||||
this.readChildren(node);
|
||||
}
|
||||
var closingTag = "</" + localName + ">";
|
||||
if (!this.match(closingTag)) {
|
||||
error("expected '" + closingTag + "'");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
if (localName === "title") {
|
||||
// Only use the first title, because SVG might have other
|
||||
// title elements which we don't care about (medium.com
|
||||
// does this, at least).
|
||||
if (localName === "title" && !this.doc.title) {
|
||||
this.doc.title = node.textContent.trim();
|
||||
} else if (localName === "head") {
|
||||
this.doc.head = node;
|
||||
|
@ -898,14 +1036,14 @@
|
|||
*/
|
||||
parse: function (html) {
|
||||
this.html = html;
|
||||
let doc = this.doc = new Document();
|
||||
var doc = this.doc = new Document();
|
||||
this.readChildren(doc);
|
||||
|
||||
// If this is an HTML document, remove root-level children except for the
|
||||
// <html> node
|
||||
if (doc.documentElement) {
|
||||
for (let i = doc.childNodes.length; --i >= 0;) {
|
||||
let child = doc.childNodes[i];
|
||||
for (var i = doc.childNodes.length; --i >= 0;) {
|
||||
var child = doc.childNodes[i];
|
||||
if (child !== doc.documentElement) {
|
||||
doc.removeChild(child);
|
||||
}
|
||||
|
|
|
@ -102,16 +102,18 @@ Readability.prototype = {
|
|||
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
|
||||
byline: /byline|author|dateline|writtenby/i,
|
||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||
trim: /^\s+|\s+$/g,
|
||||
normalize: /\s{2,}/g,
|
||||
videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
|
||||
videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
|
||||
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
||||
prevLink: /(prev|earl|old|new|<|«)/i,
|
||||
whitespace: /^\s*$/
|
||||
whitespace: /^\s*$/,
|
||||
hasContent: /\S$/,
|
||||
},
|
||||
|
||||
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
|
||||
|
||||
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
|
||||
|
||||
/**
|
||||
* Run any post-process modifications to article content as necessary.
|
||||
*
|
||||
|
@ -204,7 +206,7 @@ Readability.prototype = {
|
|||
curTitle = this._getInnerText(hOnes[0]);
|
||||
}
|
||||
|
||||
curTitle = curTitle.replace(this.REGEXPS.trim, "");
|
||||
curTitle = curTitle.trim();
|
||||
|
||||
if (curTitle.split(' ').length <= 4)
|
||||
curTitle = origTitle;
|
||||
|
@ -223,8 +225,8 @@ Readability.prototype = {
|
|||
|
||||
// Remove all style tags in head
|
||||
var styleTags = doc.getElementsByTagName("style");
|
||||
for (var st = 0; st < styleTags.length; st += 1) {
|
||||
styleTags[st].textContent = "";
|
||||
for (var st = styleTags.length - 1; st >= 0; st -= 1) {
|
||||
styleTags[st].parentNode.removeChild(styleTags[st]);
|
||||
}
|
||||
|
||||
if (doc.body) {
|
||||
|
@ -305,6 +307,8 @@ Readability.prototype = {
|
|||
},
|
||||
|
||||
_setNodeTag: function (node, tag) {
|
||||
// FIXME: this doesn't work on anything but JSDOMParser (i.e. the node's tag
|
||||
// won't actually be set).
|
||||
node.localName = tag.toLowerCase();
|
||||
node.tagName = tag.toUpperCase();
|
||||
},
|
||||
|
@ -407,6 +411,54 @@ Readability.prototype = {
|
|||
node.readability.contentScore += this._getClassWeight(node);
|
||||
},
|
||||
|
||||
_removeAndGetNext: function(node) {
|
||||
var nextNode = this._getNextNode(node, true);
|
||||
node.parentNode.removeChild(node);
|
||||
return nextNode;
|
||||
},
|
||||
|
||||
/**
|
||||
* Traverse the DOM from node to node, starting at the node passed in.
|
||||
* Pass true for the second parameter to indicate this node itself
|
||||
* (and its kids) are going away, and we want the next node over.
|
||||
*
|
||||
* Calling this in a loop will traverse the DOM depth-first.
|
||||
*/
|
||||
_getNextNode: function(node, ignoreSelfAndKids) {
|
||||
// First check for kids if those aren't being ignored
|
||||
if (!ignoreSelfAndKids && node.firstElementChild) {
|
||||
return node.firstElementChild;
|
||||
}
|
||||
// Then for siblings...
|
||||
if (node.nextElementSibling) {
|
||||
return node.nextElementSibling;
|
||||
}
|
||||
// And finally, move up the parent chain *and* find a sibling
|
||||
// (because this is depth-first traversal, we will have already
|
||||
// seen the parent nodes themselves).
|
||||
do {
|
||||
node = node.parentNode;
|
||||
} while (node && !node.nextElementSibling);
|
||||
return node && node.nextElementSibling;
|
||||
},
|
||||
|
||||
_checkByline: function(node, matchString) {
|
||||
if (this._articleByline) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node.getAttribute !== undefined) {
|
||||
var rel = node.getAttribute("rel");
|
||||
}
|
||||
|
||||
if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
|
||||
this._articleByline = node.textContent.trim();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
},
|
||||
|
||||
/***
|
||||
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
|
||||
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
|
||||
|
@ -430,65 +482,37 @@ Readability.prototype = {
|
|||
// Check if any "dir" is set on the toplevel document element
|
||||
this._articleDir = doc.documentElement.getAttribute("dir");
|
||||
|
||||
//helper function used below in the 'while' loop:
|
||||
function purgeNode(node, allElements) {
|
||||
for (var i = node.childNodes.length; --i >= 0;) {
|
||||
purgeNode(node.childNodes[i], allElements);
|
||||
}
|
||||
if (node._index !== undefined && allElements[node._index] == node)
|
||||
delete allElements[node._index];
|
||||
}
|
||||
while (true) {
|
||||
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
||||
var allElements = page.getElementsByTagName('*');
|
||||
|
||||
// First, node prepping. Trash nodes that look cruddy (like ones with the
|
||||
// class name "comment", etc), and turn divs into P tags where they have been
|
||||
// used inappropriately (as in, where they contain no other block level elements.)
|
||||
//
|
||||
// Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
|
||||
// TODO: Shouldn't this be a reverse traversal?
|
||||
var node = null;
|
||||
var nodesToScore = [];
|
||||
var elementsToScore = [];
|
||||
var node = this._doc.documentElement;
|
||||
|
||||
// Let each node know its index in the allElements array.
|
||||
for (var i = allElements.length; --i >= 0;) {
|
||||
allElements[i]._index = i;
|
||||
}
|
||||
while (node) {
|
||||
var matchString = node.className + " " + node.id;
|
||||
|
||||
/**
|
||||
* JSDOMParser returns static node lists, not live ones. When we remove
|
||||
* an element from the document, we need to manually remove it - and all
|
||||
* of its children - from the allElements array.
|
||||
*/
|
||||
for (var nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) {
|
||||
if (!(node = allElements[nodeIndex]))
|
||||
// Check to see if this node is a byline, and remove it if it is.
|
||||
if (this._checkByline(node, matchString)) {
|
||||
node = this._removeAndGetNext(node);
|
||||
continue;
|
||||
|
||||
var matchString = node.className + node.id;
|
||||
if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) {
|
||||
if (this._isValidByline(node.textContent)) {
|
||||
this._articleByline = node.textContent.trim();
|
||||
node.parentNode.removeChild(node);
|
||||
purgeNode(node, allElements);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Remove unlikely candidates
|
||||
if (stripUnlikelyCandidates) {
|
||||
if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 &&
|
||||
matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 &&
|
||||
node.tagName !== "BODY") {
|
||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
|
||||
node.tagName !== "BODY") {
|
||||
this.log("Removing unlikely candidate - " + matchString);
|
||||
node.parentNode.removeChild(node);
|
||||
purgeNode(node, allElements);
|
||||
node = this._removeAndGetNext(node);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
|
||||
nodesToScore[nodesToScore.length] = node;
|
||||
elementsToScore.push(node);
|
||||
|
||||
// Turn all divs that don't have children block level elements into p's
|
||||
if (node.tagName === "DIV") {
|
||||
|
@ -496,34 +520,28 @@ Readability.prototype = {
|
|||
// element. DIVs with only a P element inside and no text content can be
|
||||
// safely converted into plain P elements to avoid confusing the scoring
|
||||
// algorithm with DIVs which are, in practice, paragraphs.
|
||||
var pIndex = this._getSinglePIndexInsideDiv(node);
|
||||
|
||||
if (pIndex >= 0 || !this._hasChildBlockElement(node)) {
|
||||
if (pIndex >= 0) {
|
||||
var newNode = node.childNodes[pIndex];
|
||||
node.parentNode.replaceChild(newNode, node);
|
||||
purgeNode(node, allElements);
|
||||
} else {
|
||||
this._setNodeTag(node, "P");
|
||||
nodesToScore[nodesToScore.length] = node;
|
||||
}
|
||||
if (this._hasSinglePInsideElement(node)) {
|
||||
var newNode = node.firstElementChild;
|
||||
node.parentNode.replaceChild(newNode, node);
|
||||
node = newNode;
|
||||
} else if (!this._hasChildBlockElement(node)) {
|
||||
this._setNodeTag(node, "P");
|
||||
elementsToScore.push(node);
|
||||
} else {
|
||||
// EXPERIMENTAL
|
||||
for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
|
||||
var childNode = node.childNodes[i];
|
||||
if (!childNode)
|
||||
continue;
|
||||
|
||||
if (childNode.nodeType === 3) { // Node.TEXT_NODE
|
||||
if (childNode.nodeType === Node.TEXT_NODE) {
|
||||
var p = doc.createElement('p');
|
||||
p.textContent = childNode.textContent;
|
||||
p.style.display = 'inline';
|
||||
p.className = 'readability-styled';
|
||||
childNode.parentNode.replaceChild(p, childNode);
|
||||
node.replaceChild(p, childNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
node = this._getNextNode(node);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -533,10 +551,10 @@ Readability.prototype = {
|
|||
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
|
||||
**/
|
||||
var candidates = [];
|
||||
for (var pt = 0; pt < nodesToScore.length; pt += 1) {
|
||||
var parentNode = nodesToScore[pt].parentNode;
|
||||
for (var pt = 0; pt < elementsToScore.length; pt += 1) {
|
||||
var parentNode = elementsToScore[pt].parentNode;
|
||||
var grandParentNode = parentNode ? parentNode.parentNode : null;
|
||||
var innerText = this._getInnerText(nodesToScore[pt]);
|
||||
var innerText = this._getInnerText(elementsToScore[pt]);
|
||||
|
||||
if (!parentNode || typeof(parentNode.tagName) === 'undefined')
|
||||
continue;
|
||||
|
@ -612,15 +630,40 @@ Readability.prototype = {
|
|||
// Move all of the page's children into topCandidate
|
||||
topCandidate = doc.createElement("DIV");
|
||||
neededToCreateTopCandidate = true;
|
||||
var children = page.childNodes;
|
||||
while (children.length) {
|
||||
this.log("Moving child out:", children[0]);
|
||||
topCandidate.appendChild(children[0]);
|
||||
// Move everything (not just elements, also text nodes etc.) into the container
|
||||
// so we even include text directly in the body:
|
||||
var kids = page.childNodes;
|
||||
while (kids.length) {
|
||||
this.log("Moving child out:", kids[0]);
|
||||
topCandidate.appendChild(kids[0]);
|
||||
}
|
||||
|
||||
page.appendChild(topCandidate);
|
||||
|
||||
this._initializeNode(topCandidate);
|
||||
} else if (topCandidate) {
|
||||
// Because of our bonus system, parents of candidates might have scores
|
||||
// themselves. They get half of the node. There won't be nodes with higher
|
||||
// scores than our topCandidate, but if we see the score going *up* in the first
|
||||
// few steps up the tree, that's a decent sign that there might be more content
|
||||
// lurking in other places that we want to unify in. The sibling stuff
|
||||
// below does some of that - but only if we've looked high enough up the DOM
|
||||
// tree.
|
||||
var parentOfTopCandidate = topCandidate.parentNode;
|
||||
// The scores shouldn't get too low.
|
||||
var scoreThreshold = topCandidate.readability.contentScore / 3;
|
||||
var lastScore = parentOfTopCandidate.readability.contentScore;
|
||||
while (parentOfTopCandidate && parentOfTopCandidate.readability) {
|
||||
var parentScore = parentOfTopCandidate.readability.contentScore;
|
||||
if (parentScore < scoreThreshold)
|
||||
break;
|
||||
if (parentScore > lastScore) {
|
||||
// Alright! We found a better parent to use.
|
||||
topCandidate = parentOfTopCandidate;
|
||||
break;
|
||||
}
|
||||
parentOfTopCandidate = parentOfTopCandidate.parentNode;
|
||||
}
|
||||
}
|
||||
|
||||
// Now that we have the top candidate, look through its siblings for content
|
||||
|
@ -631,72 +674,71 @@ Readability.prototype = {
|
|||
articleContent.id = "readability-content";
|
||||
|
||||
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
|
||||
var siblingNodes = topCandidate.parentNode.childNodes;
|
||||
var siblings = topCandidate.parentNode.children;
|
||||
|
||||
for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) {
|
||||
var siblingNode = siblingNodes[s];
|
||||
for (var s = 0, sl = siblings.length; s < sl; s++) {
|
||||
var sibling = siblings[s];
|
||||
var append = false;
|
||||
|
||||
this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : ''));
|
||||
this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
|
||||
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
|
||||
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
|
||||
|
||||
if (siblingNode === topCandidate)
|
||||
if (sibling === topCandidate) {
|
||||
append = true;
|
||||
} else {
|
||||
var contentBonus = 0;
|
||||
|
||||
var contentBonus = 0;
|
||||
// Give a bonus if sibling nodes and top candidates have the example same classname
|
||||
if (sibling.className === topCandidate.className && topCandidate.className !== "")
|
||||
contentBonus += topCandidate.readability.contentScore * 0.2;
|
||||
|
||||
// Give a bonus if sibling nodes and top candidates have the example same classname
|
||||
if (siblingNode.className === topCandidate.className && topCandidate.className !== "")
|
||||
contentBonus += topCandidate.readability.contentScore * 0.2;
|
||||
|
||||
if (typeof siblingNode.readability !== 'undefined' &&
|
||||
(siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
|
||||
append = true;
|
||||
|
||||
if (siblingNode.nodeName === "P") {
|
||||
var linkDensity = this._getLinkDensity(siblingNode);
|
||||
var nodeContent = this._getInnerText(siblingNode);
|
||||
var nodeLength = nodeContent.length;
|
||||
|
||||
if (nodeLength > 80 && linkDensity < 0.25) {
|
||||
append = true;
|
||||
} else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
|
||||
if (sibling.readability &&
|
||||
((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
|
||||
append = true;
|
||||
} else if (sibling.nodeName === "P") {
|
||||
var linkDensity = this._getLinkDensity(sibling);
|
||||
var nodeContent = this._getInnerText(sibling);
|
||||
var nodeLength = nodeContent.length;
|
||||
|
||||
if (nodeLength > 80 && linkDensity < 0.25) {
|
||||
append = true;
|
||||
} else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
|
||||
append = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (append) {
|
||||
this.log("Appending node:", siblingNode);
|
||||
this.log("Appending node:", sibling);
|
||||
|
||||
// siblingNodes is a reference to the childNodes array, and
|
||||
// siblingNode is removed from the array when we call appendChild()
|
||||
// below. As a result, we must revisit this index since the nodes
|
||||
// have been shifted.
|
||||
s -= 1;
|
||||
sl -= 1;
|
||||
|
||||
if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
|
||||
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
// Turn it into a div so it doesn't get filtered out later by accident. */
|
||||
this.log("Altering siblingNode:", siblingNode, 'to div.');
|
||||
// Turn it into a div so it doesn't get filtered out later by accident.
|
||||
this.log("Altering sibling:", sibling, 'to div.');
|
||||
|
||||
this._setNodeTag(siblingNode, "DIV");
|
||||
this._setNodeTag(sibling, "DIV");
|
||||
}
|
||||
|
||||
// To ensure a node does not interfere with readability styles,
|
||||
// remove its classnames.
|
||||
siblingNode.removeAttribute("class");
|
||||
sibling.removeAttribute("class");
|
||||
|
||||
// Append sibling and subtract from our list because it removes
|
||||
// the node when you append to another node.
|
||||
articleContent.appendChild(siblingNode);
|
||||
articleContent.appendChild(sibling);
|
||||
// siblings is a reference to the children array, and
|
||||
// sibling is removed from the array when we call appendChild().
|
||||
// As a result, we must revisit this index since the nodes
|
||||
// have been shifted.
|
||||
s -= 1;
|
||||
sl -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
this.log("Article content pre-prep: " + articleContent.innerHTML);
|
||||
if (this.ENABLE_LOGGING)
|
||||
this.log("Article content pre-prep: " + articleContent.innerHTML);
|
||||
// So we have all of the content that we need. Now we clean it up for presentation.
|
||||
this._prepArticle(articleContent);
|
||||
this.log("Article content post-prep: " + articleContent.innerHTML);
|
||||
if (this.ENABLE_LOGGING)
|
||||
this.log("Article content post-prep: " + articleContent.innerHTML);
|
||||
|
||||
if (this._curPageNum === 1) {
|
||||
if (neededToCreateTopCandidate) {
|
||||
|
@ -718,7 +760,8 @@ Readability.prototype = {
|
|||
}
|
||||
}
|
||||
|
||||
this.log("Article content after paging: " + articleContent.innerHTML);
|
||||
if (this.ENABLE_LOGGING)
|
||||
this.log("Article content after paging: " + articleContent.innerHTML);
|
||||
|
||||
// Now that we've gone through the full algorithm, check to see if
|
||||
// we got any meaningful content. If we didn't, we may need to re-run
|
||||
|
@ -760,19 +803,12 @@ Readability.prototype = {
|
|||
},
|
||||
|
||||
/**
|
||||
* Attempts to get the excerpt from these
|
||||
* sources in the following order:
|
||||
* - meta description tag
|
||||
* - open-graph description
|
||||
* - twitter cards description
|
||||
* - article's first paragraph
|
||||
* If no excerpt is found, an empty string will be
|
||||
* returned.
|
||||
*
|
||||
* @param Element - root element of the processed version page
|
||||
* @return String - excerpt of the article
|
||||
**/
|
||||
_getExcerpt: function(articleContent) {
|
||||
* Attempts to get excerpt and byline metadata for the article.
|
||||
*
|
||||
* @return Object with optional "excerpt" and "byline" properties
|
||||
*/
|
||||
_getArticleMetadata: function() {
|
||||
var metadata = {};
|
||||
var values = {};
|
||||
var metaElements = this._doc.getElementsByTagName("meta");
|
||||
|
||||
|
@ -789,7 +825,12 @@ Readability.prototype = {
|
|||
var elementName = element.getAttribute("name");
|
||||
var elementProperty = element.getAttribute("property");
|
||||
|
||||
var name;
|
||||
if (elementName === "author") {
|
||||
metadata.byline = element.getAttribute("content");
|
||||
continue;
|
||||
}
|
||||
|
||||
var name = null;
|
||||
if (namePattern.test(elementName)) {
|
||||
name = elementName;
|
||||
} else if (propertyPattern.test(elementProperty)) {
|
||||
|
@ -808,26 +849,16 @@ Readability.prototype = {
|
|||
}
|
||||
|
||||
if ("description" in values) {
|
||||
return values["description"];
|
||||
}
|
||||
|
||||
if ("og:description" in values) {
|
||||
metadata.excerpt = values["description"];
|
||||
} else if ("og:description" in values) {
|
||||
// Use facebook open graph description.
|
||||
return values["og:description"];
|
||||
}
|
||||
|
||||
if ("twitter:description" in values) {
|
||||
metadata.excerpt = values["og:description"];
|
||||
} else if ("twitter:description" in values) {
|
||||
// Use twitter cards description.
|
||||
return values["twitter:description"];
|
||||
metadata.excerpt = values["twitter:description"];
|
||||
}
|
||||
|
||||
// No description meta tags, use the article's first paragraph.
|
||||
var paragraphs = articleContent.getElementsByTagName("p");
|
||||
if (paragraphs.length > 0) {
|
||||
return paragraphs[0].textContent;
|
||||
}
|
||||
|
||||
return "";
|
||||
return metadata;
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -847,33 +878,28 @@ Readability.prototype = {
|
|||
},
|
||||
|
||||
/**
|
||||
* Get child index of the only P element inside a DIV with no
|
||||
* text content. Returns -1 if the DIV node contains non-empty
|
||||
* text nodes or if it contains other element nodes.
|
||||
* Check if this node has only whitespace and a single P element
|
||||
* Returns false if the DIV node contains non-empty text nodes
|
||||
* or if it contains no P or more than 1 element.
|
||||
*
|
||||
* @param Element
|
||||
**/
|
||||
_getSinglePIndexInsideDiv: function(e) {
|
||||
_hasSinglePInsideElement: function(e) {
|
||||
// There should be exactly 1 element child which is a P:
|
||||
if (e.children.length != 1 || e.firstElementChild.tagName !== "P") {
|
||||
return false;
|
||||
}
|
||||
// And there should be no text nodes with real content
|
||||
var childNodes = e.childNodes;
|
||||
var pIndex = -1;
|
||||
|
||||
for (var i = childNodes.length; --i >= 0;) {
|
||||
var node = childNodes[i];
|
||||
|
||||
if (node.nodeType === Node.ELEMENT_NODE) {
|
||||
if (node.tagName !== "P")
|
||||
return -1;
|
||||
|
||||
if (pIndex >= 0)
|
||||
return -1;
|
||||
|
||||
pIndex = i;
|
||||
} else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
|
||||
return -1;
|
||||
if (node.nodeType == Node.TEXT_NODE &&
|
||||
this.REGEXPS.hasContent.test(node.textContent)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return pIndex;
|
||||
return true;
|
||||
},
|
||||
|
||||
/**
|
||||
|
@ -882,12 +908,9 @@ Readability.prototype = {
|
|||
* @param Element
|
||||
*/
|
||||
_hasChildBlockElement: function (e) {
|
||||
var length = e.childNodes.length;
|
||||
var length = e.children.length;
|
||||
for (var i = 0; i < length; i++) {
|
||||
var child = e.childNodes[i];
|
||||
if (child.nodeType != 1)
|
||||
continue;
|
||||
|
||||
var child = e.children[i];
|
||||
if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
|
||||
return true;
|
||||
}
|
||||
|
@ -902,7 +925,7 @@ Readability.prototype = {
|
|||
* @return string
|
||||
**/
|
||||
_getInnerText: function(e, normalizeSpaces) {
|
||||
var textContent = e.textContent.replace(this.REGEXPS.trim, "");
|
||||
var textContent = e.textContent.trim();
|
||||
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
|
||||
|
||||
if (normalizeSpaces) {
|
||||
|
@ -933,10 +956,9 @@ Readability.prototype = {
|
|||
**/
|
||||
_cleanStyles: function(e) {
|
||||
e = e || this._doc;
|
||||
var cur = e.firstChild;
|
||||
|
||||
if (!e)
|
||||
return;
|
||||
var cur = e.firstChild;
|
||||
|
||||
// Remove any root styles, if we're able.
|
||||
if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
|
||||
|
@ -944,7 +966,7 @@ Readability.prototype = {
|
|||
|
||||
// Go until there are no more child nodes
|
||||
while (cur !== null) {
|
||||
if (cur.nodeType === 1) {
|
||||
if (cur.nodeType === cur.ELEMENT_NODE) {
|
||||
// Remove style attribute(s) :
|
||||
if (cur.className !== "readability-styled")
|
||||
cur.removeAttribute("style");
|
||||
|
@ -1355,19 +1377,19 @@ Readability.prototype = {
|
|||
|
||||
// Look for a special classname
|
||||
if (typeof(e.className) === 'string' && e.className !== '') {
|
||||
if (e.className.search(this.REGEXPS.negative) !== -1)
|
||||
if (this.REGEXPS.negative.test(e.className))
|
||||
weight -= 25;
|
||||
|
||||
if (e.className.search(this.REGEXPS.positive) !== -1)
|
||||
if (this.REGEXPS.positive.test(e.className))
|
||||
weight += 25;
|
||||
}
|
||||
|
||||
// Look for a special ID
|
||||
if (typeof(e.id) === 'string' && e.id !== '') {
|
||||
if (e.id.search(this.REGEXPS.negative) !== -1)
|
||||
if (this.REGEXPS.negative.test(e.id))
|
||||
weight -= 25;
|
||||
|
||||
if (e.id.search(this.REGEXPS.positive) !== -1)
|
||||
if (this.REGEXPS.positive.test(e.id))
|
||||
weight += 25;
|
||||
}
|
||||
|
||||
|
@ -1395,11 +1417,11 @@ Readability.prototype = {
|
|||
}
|
||||
|
||||
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
||||
if (attributeValues.search(this.REGEXPS.videos) !== -1)
|
||||
if (this.REGEXPS.videos.test(attributeValues))
|
||||
continue;
|
||||
|
||||
// Then check the elements inside this element for the same.
|
||||
if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1)
|
||||
if (this.REGEXPS.videos.test(targetList[y].innerHTML))
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -1445,7 +1467,7 @@ Readability.prototype = {
|
|||
var embedCount = 0;
|
||||
var embeds = tagsList[i].getElementsByTagName("embed");
|
||||
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
|
||||
if (embeds[ei].src.search(this.REGEXPS.videos) === -1)
|
||||
if (!this.REGEXPS.videos.test(embeds[ei].src))
|
||||
embedCount += 1;
|
||||
}
|
||||
|
||||
|
@ -1532,6 +1554,8 @@ Readability.prototype = {
|
|||
this._prepDocument();
|
||||
|
||||
var articleTitle = this._getArticleTitle();
|
||||
var metadata = this._getArticleMetadata();
|
||||
|
||||
var articleContent = this._grabArticle();
|
||||
if (!articleContent)
|
||||
return null;
|
||||
|
@ -1548,14 +1572,22 @@ Readability.prototype = {
|
|||
// }).bind(this), 500);
|
||||
// }
|
||||
|
||||
var excerpt = this._getExcerpt(articleContent);
|
||||
// If we haven't found an excerpt in the article's metadata, use the article's
|
||||
// first paragraph as the excerpt. This is used for displaying a preview of
|
||||
// the article's content.
|
||||
if (!metadata.excerpt) {
|
||||
var paragraphs = articleContent.getElementsByTagName("p");
|
||||
if (paragraphs.length > 0) {
|
||||
metadata.excerpt = paragraphs[0].textContent;
|
||||
}
|
||||
}
|
||||
|
||||
return { uri: this._uri,
|
||||
title: articleTitle,
|
||||
byline: this._articleByline,
|
||||
byline: metadata.byline || this._articleByline,
|
||||
dir: this._articleDir,
|
||||
content: articleContent.innerHTML,
|
||||
length: articleContent.textContent.length,
|
||||
excerpt: excerpt };
|
||||
excerpt: metadata.excerpt };
|
||||
}
|
||||
};
|
||||
|
|
Загрузка…
Ссылка в новой задаче