зеркало из https://github.com/mozilla/gecko-dev.git
Bug 777966 - Replace regex <br> matching with replaceBrs(). r=lucasr
--HG-- extra : rebase_source : 0cbe7d9788e551ea1412cb014f92cd5b7b038093
This commit is contained in:
Родитель
fcdb4d4898
Коммит
51ae0e7114
|
@ -73,7 +73,6 @@ Readability.prototype = {
|
||||||
negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
|
||||||
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
|
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
|
||||||
divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
||||||
replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
|
|
||||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||||
trim: /^\s+|\s+$/g,
|
trim: /^\s+|\s+$/g,
|
||||||
normalize: /\s{2,}/g,
|
normalize: /\s{2,}/g,
|
||||||
|
@ -283,11 +282,80 @@ Readability.prototype = {
|
||||||
styleTags[st].textContent = "";
|
styleTags[st].textContent = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
// Turn all double br's into p's. Note, this is pretty costly as far
|
this._replaceBrs(doc.body);
|
||||||
// as processing goes. Maybe optimize later.
|
|
||||||
doc.body.innerHTML =
|
doc.body.innerHTML = doc.body.innerHTML.replace(this.REGEXPS.replaceFonts, '<$1span>');
|
||||||
doc.body.innerHTML.replace(this.REGEXPS.replaceBrs, '</p><p>').
|
},
|
||||||
replace(this.REGEXPS.replaceFonts, '<$1span>');
|
|
||||||
|
/**
|
||||||
|
* Replaces 2 or more successive <br> elements with a single <p>.
|
||||||
|
* Whitespace between <br> elements are ignored. For example:
|
||||||
|
* <div>foo<br>bar<br> <br><br>abc</div>
|
||||||
|
* will become:
|
||||||
|
* <div>foo<br>bar<p>abc</p></div>
|
||||||
|
*/
|
||||||
|
_replaceBrs: function (elem) {
|
||||||
|
// ignore whitespace between elements
|
||||||
|
let whitespace = /^\s*$/;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the next element, starting from the given node, and ignoring
|
||||||
|
* whitespace in between. If the given node is an element, the same node is
|
||||||
|
* returned.
|
||||||
|
*/
|
||||||
|
function nextElement(node) {
|
||||||
|
let next = node;
|
||||||
|
while (next
|
||||||
|
&& (next.nodeType != Node.ELEMENT_NODE)
|
||||||
|
&& !whitespace.test(next.textContent)) {
|
||||||
|
next = next.nextSibling;
|
||||||
|
}
|
||||||
|
return next;
|
||||||
|
}
|
||||||
|
|
||||||
|
let brs = elem.getElementsByTagName("br");
|
||||||
|
for (let i = 0; i < brs.length; i++) {
|
||||||
|
let br = brs[i];
|
||||||
|
let next = br.nextSibling;
|
||||||
|
|
||||||
|
// Whether 2 or more <br> elements have been found and replaced with a
|
||||||
|
// <p> block.
|
||||||
|
let replaced = false;
|
||||||
|
|
||||||
|
// If we find a <br> chain, remove the <br>s until we hit another element
|
||||||
|
// or non-whitespace. This leaves behind the first <br> in the chain
|
||||||
|
// (which will be replaced with a <p> later).
|
||||||
|
while ((next = nextElement(next)) && (next.tagName == "BR")) {
|
||||||
|
replaced = true;
|
||||||
|
let sibling = next.nextSibling;
|
||||||
|
next.parentNode.removeChild(next);
|
||||||
|
next = sibling;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we removed a <br> chain, replace the remaining <br> with a <p>. Add
|
||||||
|
// all sibling nodes as children of the <p> until we hit another <br>
|
||||||
|
// chain.
|
||||||
|
if (replaced) {
|
||||||
|
let p = this._doc.createElement("p");
|
||||||
|
br.parentNode.replaceChild(p, br);
|
||||||
|
|
||||||
|
next = p.nextSibling;
|
||||||
|
while (next) {
|
||||||
|
// If we've hit another <br><br>, we're done adding children to this <p>.
|
||||||
|
if (next.tagName == "BR") {
|
||||||
|
let nextElem = nextElement(next);
|
||||||
|
if (nextElem && nextElem.tagName == "BR") {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise, make this node a child of the new <p>.
|
||||||
|
let sibling = next.nextSibling;
|
||||||
|
p.appendChild(next);
|
||||||
|
next = sibling;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1109,10 +1177,10 @@ Readability.prototype = {
|
||||||
let responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
let responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
||||||
responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
|
||||||
responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
|
responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
|
||||||
responseHtml = responseHtml.replace(this.REGEXPS.replaceBrs, '</p><p>');
|
|
||||||
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
|
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
|
||||||
|
|
||||||
page.innerHTML = responseHtml;
|
page.innerHTML = responseHtml;
|
||||||
|
this._replaceBrs(page);
|
||||||
|
|
||||||
// Reset all flags for the next page, as they will search through it and
|
// Reset all flags for the next page, as they will search through it and
|
||||||
// disable as necessary at the end of grabArticle.
|
// disable as necessary at the end of grabArticle.
|
||||||
|
|
Загрузка…
Ссылка в новой задаче