No bug, update Readability to github tip, rs=me

This commit is contained in:
Margaret Leibovic 2015-04-09 15:11:24 -07:00
parent d5e8a58213
commit ca1497979e
2 changed files with 193 additions and 40 deletions

View File

@ -37,6 +37,39 @@
dump("JSDOMParser error: " + m + "\n"); dump("JSDOMParser error: " + m + "\n");
} }
// XML only defines these and the numeric ones:
var entityTable = {
  "lt": "<",
  "gt": ">",
  "amp": "&",
  "quot": '"',
  "apos": "'",
};

// Inverse of entityTable: maps each special character to its entity.
var reverseEntityTable = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  '"': "&quot;",
  "'": "&apos;",
};

/**
 * Escape the five XML special characters (& < > ' ") as named entities.
 *
 * @param {string} s The text to escape.
 * @return {string} The text with every special character replaced by the
 *                  matching entry of reverseEntityTable.
 */
function encodeHTML(s) {
  return s.replace(/[&<>'"]/g, function(x) {
    return reverseEntityTable[x];
  });
}

/**
 * Decode the five XML named entities plus decimal (&#65;) and hexadecimal
 * (&#x41;) character references.
 *
 * Everything is decoded in a single pass so that the output of one
 * substitution is never decoded a second time: "&amp;#60;" yields "&#60;",
 * not "<". Anything that is not a well-formed reference (unknown names,
 * non-hex digits, code points above U+10FFFF) is left untouched.
 *
 * @param {string} str The escaped text.
 * @return {string} The decoded text.
 */
function decodeHTML(str) {
  // No "i" flag: entity names are case-sensitive, only the "x" marker and
  // the hex digits may be written in either case.
  var reference = /&(?:(quot|amp|apos|lt|gt)|#(?:[xX]([0-9a-fA-F]+)|([0-9]+)));/g;
  return str.replace(reference, function(match, tag, hex, numStr) {
    if (tag) {
      return entityTable[tag];
    }
    var num = parseInt(hex || numStr, hex ? 16 : 10); // read num
    if (num > 0x10FFFF) {
      // Not a Unicode code point; keep the original text.
      return match;
    }
    if (num <= 0xFFFF) {
      return String.fromCharCode(num);
    }
    // Code points beyond the BMP need a UTF-16 surrogate pair.
    num -= 0x10000;
    return String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
  });
}
// When a style is set in JS, map it to the corresponding CSS attribute // When a style is set in JS, map it to the corresponding CSS attribute
var styleMap = { var styleMap = {
"alignmentBaseline": "alignment-baseline", "alignmentBaseline": "alignment-baseline",
@ -447,7 +480,9 @@
} }
return oldNode; return oldNode;
} }
} },
__JSDOMParser__: true,
}; };
for (var i in nodeTypes) { for (var i in nodeTypes) {
@ -456,7 +491,27 @@
var Attribute = function (name, value) { var Attribute = function (name, value) {
this.name = name; this.name = name;
this.value = value; this._value = value;
};
// Accessors for an attribute's stored value (_value) and a lazily computed,
// cached decoded copy of it (_decodedValue). setDecodedValue stores the
// encodeHTML() output in _value, so _value is kept in escaped form.
Attribute.prototype = {
// Returns _value exactly as stored. Only a getter is defined for `value`.
get value() {
return this._value;
},
// Replaces _value and drops the cached decoded copy so that the next
// getDecodedValue() call recomputes it.
setValue: function(newValue) {
this._value = newValue;
delete this._decodedValue;
},
// Takes an unescaped string: stores encodeHTML(newValue) in _value and
// caches newValue itself as the decoded form.
setDecodedValue: function(newValue) {
this._value = encodeHTML(newValue);
this._decodedValue = newValue;
},
// Returns the decoded value, running decodeHTML() on _value the first time it
// is requested. A falsy _value, or an empty decode result, yields "".
getDecodedValue: function() {
if (typeof this._decodedValue === "undefined") {
this._decodedValue = (this._value && decodeHTML(this._value)) || "";
}
return this._decodedValue;
},
// NOTE(review): the closing line below looks like two diff columns fused
// together (old constructor close + new prototype close) - confirm.
}; };
var Comment = function () { var Comment = function () {
@ -479,7 +534,27 @@
nodeName: "#text", nodeName: "#text",
nodeType: Node.TEXT_NODE, nodeType: Node.TEXT_NODE,
// textContent and innerHTML are two views of the same text, backed by
// _textContent and _innerHTML. Each getter derives its value from the other
// field on first access and caches it; each setter stores its own field and
// deletes the other so it is recomputed on the next read.
// NOTE(review): the next line appears to fuse the removed `textContent: ""`
// property with the added getter header (diff-view residue) - confirm.
textContent: "" get textContent() {
if (typeof this._textContent === "undefined") {
// Not cached yet: decode the stored markup (or "" if there is none).
this._textContent = decodeHTML(this._innerHTML || "");
}
return this._textContent;
},
get innerHTML() {
if (typeof this._innerHTML === "undefined") {
// Not cached yet: escape the stored text (or "" if there is none).
this._innerHTML = encodeHTML(this._textContent || "");
}
return this._innerHTML;
},
set innerHTML(newHTML) {
this._innerHTML = newHTML;
// Invalidate the decoded view; it is rebuilt lazily by the textContent getter.
delete this._textContent;
},
set textContent(newText) {
this._textContent = newText;
// Invalidate the escaped view; it is rebuilt lazily by the innerHTML getter.
delete this._innerHTML;
},
} }
var Document = function () { var Document = function () {
@ -582,13 +657,15 @@
// serialize attribute list // serialize attribute list
for (var j = 0; j < child.attributes.length; j++) { for (var j = 0; j < child.attributes.length; j++) {
var attr = child.attributes[j]; var attr = child.attributes[j];
var quote = (attr.value.indexOf('"') === -1 ? '"' : "'"); // the attribute value will be HTML escaped.
arr.push(" " + attr.name + '=' + quote + attr.value + quote); var val = attr.value;
var quote = (val.indexOf('"') === -1 ? '"' : "'");
arr.push(" " + attr.name + '=' + quote + val + quote);
} }
if (child.localName in voidElems) { if (child.localName in voidElems) {
// if this is a self-closing element, end it here // if this is a self-closing element, end it here
arr.push("/>"); arr.push(">");
} else { } else {
// otherwise, add its children // otherwise, add its children
arr.push(">"); arr.push(">");
@ -596,7 +673,8 @@
arr.push("</" + child.localName + ">"); arr.push("</" + child.localName + ">");
} }
} else { } else {
arr.push(child.textContent); // This is a text node, so asking for innerHTML won't recurse.
arr.push(child.innerHTML);
} }
} }
} }
@ -658,7 +736,7 @@
for (var i = this.attributes.length; --i >= 0;) { for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i]; var attr = this.attributes[i];
if (attr.name === name) if (attr.name === name)
return attr.value; return attr.getDecodedValue();
} }
return undefined; return undefined;
}, },
@ -667,11 +745,11 @@
for (var i = this.attributes.length; --i >= 0;) { for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i]; var attr = this.attributes[i];
if (attr.name === name) { if (attr.name === name) {
attr.value = value; attr.setDecodedValue(value);
return; return;
} }
} }
this.attributes.push(new Attribute(name, value)); this.attributes.push(new Attribute(name, encodeHTML(value)));
}, },
removeAttribute: function (name) { removeAttribute: function (name) {
@ -822,9 +900,6 @@
// Read the attribute value (and consume the matching quote) // Read the attribute value (and consume the matching quote)
var value = this.readString(c); var value = this.readString(c);
if (!value)
return;
node.attributes.push(new Attribute(name, value)); node.attributes.push(new Attribute(name, value));
return; return;
@ -894,7 +969,7 @@
*/ */
match: function (str) { match: function (str) {
var strlen = str.length; var strlen = str.length;
if (this.html.substr(this.currentChar, strlen) === str) { if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) {
this.currentChar += strlen; this.currentChar += strlen;
return true; return true;
} }
@ -954,10 +1029,10 @@
// looking for the same < all the time. // looking for the same < all the time.
this.currentChar--; this.currentChar--;
if (n === -1) { if (n === -1) {
textNode.textContent += this.html.substring(this.currentChar, this.html.length); textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length; this.currentChar = this.html.length;
} else { } else {
textNode.textContent += this.html.substring(this.currentChar, n); textNode.innerHTML += this.html.substring(this.currentChar, n);
this.currentChar = n; this.currentChar = n;
} }
if (!haveTextNode) if (!haveTextNode)
@ -1000,10 +1075,10 @@
var node = new Text(); var node = new Text();
var n = this.html.indexOf("<", this.currentChar); var n = this.html.indexOf("<", this.currentChar);
if (n === -1) { if (n === -1) {
node.textContent = this.html.substring(this.currentChar, this.html.length); node.innerHTML = this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length; this.currentChar = this.html.length;
} else { } else {
node.textContent = this.html.substring(this.currentChar, n); node.innerHTML = this.html.substring(this.currentChar, n);
this.currentChar = n; this.currentChar = n;
} }
return node; return node;

View File

@ -26,8 +26,15 @@
* available at: http://code.google.com/p/arc90labs-readability * available at: http://code.google.com/p/arc90labs-readability
*/ */
var root = this; var root = this;
var Readability = function(uri, doc) {
var ENABLE_LOGGING = false; /**
* Public constructor.
* @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
var Readability = function(uri, doc, options) {
options = options || {};
this._uri = uri; this._uri = uri;
this._doc = doc; this._doc = doc;
@ -35,6 +42,12 @@ var Readability = function(uri, doc) {
this._articleByline = null; this._articleByline = null;
this._articleDir = null; this._articleDir = null;
// Configurable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
// Start with all flags set // Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS | this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES | this.FLAG_WEIGHT_CLASSES |
@ -52,7 +65,7 @@ var Readability = function(uri, doc) {
this._curPageNum = 1; this._curPageNum = 1;
// Control whether log messages are sent to the console // Control whether log messages are sent to the console
if (ENABLE_LOGGING) { if (this._debug) {
function logEl(e) { function logEl(e) {
var rv = e.nodeName + " "; var rv = e.nodeName + " ";
if (e.nodeType == e.TEXT_NODE) { if (e.nodeType == e.TEXT_NODE) {
@ -84,21 +97,24 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2, FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4, FLAG_CLEAN_CONDITIONALLY: 0x4,
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
// The number of top candidates to consider when analysing how // The number of top candidates to consider when analysing how
// tight the competition is among candidates. // tight the competition is among candidates.
N_TOP_CANDIDATES: 5, DEFAULT_N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call // The maximum number of pages to loop through before we call
// it quits and just show a link. // it quits and just show a link.
MAX_PAGES: 5, DEFAULT_MAX_PAGES: 5,
// All of the regular expressions in use within readability. // All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops. // Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: { REGEXPS: {
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby/i, byline: /byline|author|dateline|writtenby/i,
replaceFonts: /<(\/?)font[^>]*>/gi, replaceFonts: /<(\/?)font[^>]*>/gi,
@ -336,10 +352,25 @@ Readability.prototype = {
}, },
_setNodeTag: function (node, tag) { _setNodeTag: function (node, tag) {
// FIXME this doesn't work on anything but JSDOMParser (ie the node's tag this.log("_setNodeTag", node, tag);
// won't actually be set). if (node.__JSDOMParser__) {
node.localName = tag.toLowerCase(); node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase(); node.tagName = tag.toUpperCase();
return node;
}
var replacement = node.ownerDocument.createElement(tag);
while (node.firstChild) {
replacement.appendChild(node.firstChild);
}
node.parentNode.replaceChild(replacement, node);
if (node.readability)
replacement.readability = node.readability;
for (var i = 0; i < node.attributes.length; i++) {
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
}
return replacement;
}, },
/** /**
@ -469,6 +500,37 @@ Readability.prototype = {
return node && node.nextElementSibling; return node && node.nextElementSibling;
}, },
/**
* Like _getNextNode, but for DOM implementations with no
* firstElementChild/nextElementSibling functionality...
*/
_getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
function nextSiblingEl(n) {
do {
n = n.nextSibling;
} while (n && n.nodeType !== n.ELEMENT_NODE);
return n;
}
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.children[0]) {
return node.children[0];
}
// Then for siblings...
var next = nextSiblingEl(node);
if (next) {
return next;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
if (node)
next = nextSiblingEl(node);
} while (node && !next);
return node && next;
},
_checkByline: function(node, matchString) { _checkByline: function(node, matchString) {
if (this._articleByline) { if (this._articleByline) {
return false; return false;
@ -494,6 +556,7 @@ Readability.prototype = {
* @return Element * @return Element
**/ **/
_grabArticle: function (page) { _grabArticle: function (page) {
this.log("**** grabArticle ****");
var doc = this._doc; var doc = this._doc;
var isPaging = (page !== null ? true: false); var isPaging = (page !== null ? true: false);
page = page ? page : this._doc.body; page = page ? page : this._doc.body;
@ -548,11 +611,11 @@ Readability.prototype = {
// safely converted into plain P elements to avoid confusing the scoring // safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs. // algorithm with DIVs with are, in practice, paragraphs.
if (this._hasSinglePInsideElement(node)) { if (this._hasSinglePInsideElement(node)) {
var newNode = node.firstElementChild; var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node); node.parentNode.replaceChild(newNode, node);
node = newNode; node = newNode;
} else if (!this._hasChildBlockElement(node)) { } else if (!this._hasChildBlockElement(node)) {
this._setNodeTag(node, "P"); node = this._setNodeTag(node, "P");
elementsToScore.push(node); elementsToScore.push(node);
} else { } else {
// EXPERIMENTAL // EXPERIMENTAL
@ -635,12 +698,12 @@ Readability.prototype = {
this.log('Candidate:', candidate, "with score " + candidateScore); this.log('Candidate:', candidate, "with score " + candidateScore);
for (var t = 0; t < this.N_TOP_CANDIDATES; t++) { for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t]; var aTopCandidate = topCandidates[t];
if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
topCandidates.splice(t, 0, candidate); topCandidates.splice(t, 0, candidate);
if (topCandidates.length > this.N_TOP_CANDIDATES) if (topCandidates.length > this._nbTopCandidates)
topCandidates.pop(); topCandidates.pop();
break; break;
} }
@ -743,7 +806,7 @@ Readability.prototype = {
// Turn it into a div so it doesn't get filtered out later by accident. // Turn it into a div so it doesn't get filtered out later by accident.
this.log("Altering sibling:", sibling, 'to div.'); this.log("Altering sibling:", sibling, 'to div.');
this._setNodeTag(sibling, "DIV"); sibling = this._setNodeTag(sibling, "DIV");
} }
// To ensure a node does not interfere with readability styles, // To ensure a node does not interfere with readability styles,
@ -760,11 +823,11 @@ Readability.prototype = {
} }
} }
if (this.ENABLE_LOGGING) if (this._debug)
this.log("Article content pre-prep: " + articleContent.innerHTML); this.log("Article content pre-prep: " + articleContent.innerHTML);
// So we have all of the content that we need. Now we clean it up for presentation. // So we have all of the content that we need. Now we clean it up for presentation.
this._prepArticle(articleContent); this._prepArticle(articleContent);
if (this.ENABLE_LOGGING) if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML); this.log("Article content post-prep: " + articleContent.innerHTML);
if (this._curPageNum === 1) { if (this._curPageNum === 1) {
@ -787,7 +850,7 @@ Readability.prototype = {
} }
} }
if (this.ENABLE_LOGGING) if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML); this.log("Article content after paging: " + articleContent.innerHTML);
// Now that we've gone through the full algorithm, check to see if // Now that we've gone through the full algorithm, check to see if
@ -900,6 +963,10 @@ Readability.prototype = {
if (scriptNode.parentNode) if (scriptNode.parentNode)
scriptNode.parentNode.removeChild(scriptNode); scriptNode.parentNode.removeChild(scriptNode);
}); });
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
if (noscriptNode.parentNode)
noscriptNode.parentNode.removeChild(noscriptNode);
});
}, },
/** /**
@ -911,7 +978,7 @@ Readability.prototype = {
**/ **/
_hasSinglePInsideElement: function(element) { _hasSinglePInsideElement: function(element) {
// There should be exactly 1 element child which is a P: // There should be exactly 1 element child which is a P:
if (element.children.length != 1 || element.firstElementChild.tagName !== "P") { if (element.children.length != 1 || element.children[0].tagName !== "P") {
return false; return false;
} }
@ -1290,7 +1357,7 @@ Readability.prototype = {
doc.getElementById("readability-content").appendChild(articlePage); doc.getElementById("readability-content").appendChild(articlePage);
if (this._curPageNum > this.MAX_PAGES) { if (this._curPageNum > this._maxPages) {
var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>"; var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
return; return;
@ -1613,6 +1680,17 @@ Readability.prototype = {
* @return void * @return void
**/ **/
parse: function () { parse: function () {
// Avoid parsing too large documents, as per configuration option
if (this._maxElemsToParse > 0) {
var numTags = this._doc.getElementsByTagName("*").length;
if (numTags > this._maxElemsToParse) {
throw new Error("Aborting parsing document; " + numTags + " elements found");
}
}
if (typeof this._doc.documentElement.firstElementChild === "undefined") {
this._getNextNode = this._getNextNodeNoElementProperties;
}
// Remove script tags from the document. // Remove script tags from the document.
this._removeScripts(this._doc); this._removeScripts(this._doc);