No bug, update Readability to github tip, rs=me

This commit is contained in:
Margaret Leibovic 2015-04-09 15:11:24 -07:00
parent d5e8a58213
commit ca1497979e
2 changed files with 193 additions and 40 deletions

View File

@ -37,6 +37,39 @@
dump("JSDOMParser error: " + m + "\n");
}
// XML only defines these and the numeric ones:
var entityTable = {
"lt": "<",
"gt": ">",
"amp": "&",
"quot": '"',
"apos": "'",
};
var reverseEntityTable = {
"<": "&lt;",
">": "&gt;",
"&": "&amp;",
'"': "&quot;",
"'": "&apos;",
};
function encodeHTML(s) {
return s.replace(/[&<>'"]/g, function(x) {
return reverseEntityTable[x];
});
}
function decodeHTML(str) {
return str.replace(/&(quot|amp|apos|lt|gt);/g, function(match, tag) {
return entityTable[tag];
}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(match, hex, numStr) {
var num = parseInt(hex || numStr, hex ? 16 : 10); // read num
return String.fromCharCode(num);
});
}
// When a style is set in JS, map it to the corresponding CSS attribute
var styleMap = {
"alignmentBaseline": "alignment-baseline",
@ -447,7 +480,9 @@
}
return oldNode;
}
}
},
__JSDOMParser__: true,
};
for (var i in nodeTypes) {
@ -456,7 +491,27 @@
var Attribute = function (name, value) {
this.name = name;
this.value = value;
this._value = value;
};
Attribute.prototype = {
get value() {
return this._value;
},
setValue: function(newValue) {
this._value = newValue;
delete this._decodedValue;
},
setDecodedValue: function(newValue) {
this._value = encodeHTML(newValue);
this._decodedValue = newValue;
},
getDecodedValue: function() {
if (typeof this._decodedValue === "undefined") {
this._decodedValue = (this._value && decodeHTML(this._value)) || "";
}
return this._decodedValue;
},
};
var Comment = function () {
@ -479,7 +534,27 @@
nodeName: "#text",
nodeType: Node.TEXT_NODE,
textContent: ""
get textContent() {
if (typeof this._textContent === "undefined") {
this._textContent = decodeHTML(this._innerHTML || "");
}
return this._textContent;
},
get innerHTML() {
if (typeof this._innerHTML === "undefined") {
this._innerHTML = encodeHTML(this._textContent || "");
}
return this._innerHTML;
},
set innerHTML(newHTML) {
this._innerHTML = newHTML;
delete this._textContent;
},
set textContent(newText) {
this._textContent = newText;
delete this._innerHTML;
},
}
var Document = function () {
@ -582,13 +657,15 @@
// serialize attribute list
for (var j = 0; j < child.attributes.length; j++) {
var attr = child.attributes[j];
var quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
arr.push(" " + attr.name + '=' + quote + attr.value + quote);
// the attribute value will be HTML escaped.
var val = attr.value;
var quote = (val.indexOf('"') === -1 ? '"' : "'");
arr.push(" " + attr.name + '=' + quote + val + quote);
}
if (child.localName in voidElems) {
// if this is a self-closing element, end it here
arr.push("/>");
arr.push(">");
} else {
// otherwise, add its children
arr.push(">");
@ -596,7 +673,8 @@
arr.push("</" + child.localName + ">");
}
} else {
arr.push(child.textContent);
// This is a text node, so asking for innerHTML won't recurse.
arr.push(child.innerHTML);
}
}
}
@ -658,7 +736,7 @@
for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i];
if (attr.name === name)
return attr.value;
return attr.getDecodedValue();
}
return undefined;
},
@ -667,11 +745,11 @@
for (var i = this.attributes.length; --i >= 0;) {
var attr = this.attributes[i];
if (attr.name === name) {
attr.value = value;
attr.setDecodedValue(value);
return;
}
}
this.attributes.push(new Attribute(name, value));
this.attributes.push(new Attribute(name, encodeHTML(value)));
},
removeAttribute: function (name) {
@ -822,9 +900,6 @@
// Read the attribute value (and consume the matching quote)
var value = this.readString(c);
if (!value)
return;
node.attributes.push(new Attribute(name, value));
return;
@ -894,7 +969,7 @@
*/
match: function (str) {
var strlen = str.length;
if (this.html.substr(this.currentChar, strlen) === str) {
if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) {
this.currentChar += strlen;
return true;
}
@ -954,10 +1029,10 @@
// looking for the same < all the time.
this.currentChar--;
if (n === -1) {
textNode.textContent += this.html.substring(this.currentChar, this.html.length);
textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length;
} else {
textNode.textContent += this.html.substring(this.currentChar, n);
textNode.innerHTML += this.html.substring(this.currentChar, n);
this.currentChar = n;
}
if (!haveTextNode)
@ -990,7 +1065,7 @@
*/
readNode: function () {
var c = this.nextChar();
if (c === undefined)
return null;
@ -1000,10 +1075,10 @@
var node = new Text();
var n = this.html.indexOf("<", this.currentChar);
if (n === -1) {
node.textContent = this.html.substring(this.currentChar, this.html.length);
node.innerHTML = this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length;
} else {
node.textContent = this.html.substring(this.currentChar, n);
node.innerHTML = this.html.substring(this.currentChar, n);
this.currentChar = n;
}
return node;

View File

@ -26,8 +26,15 @@
* available at: http://code.google.com/p/arc90labs-readability
*/
var root = this;
var Readability = function(uri, doc) {
var ENABLE_LOGGING = false;
/**
* Public constructor.
* @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
var Readability = function(uri, doc, options) {
options = options || {};
this._uri = uri;
this._doc = doc;
@ -35,6 +42,12 @@ var Readability = function(uri, doc) {
this._articleByline = null;
this._articleDir = null;
// Configureable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
@ -52,7 +65,7 @@ var Readability = function(uri, doc) {
this._curPageNum = 1;
// Control whether log messages are sent to the console
if (ENABLE_LOGGING) {
if (this._debug) {
function logEl(e) {
var rv = e.nodeName + " ";
if (e.nodeType == e.TEXT_NODE) {
@ -84,21 +97,24 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
// The number of top candidates to consider when analysing how
// tight the competition is among candidates.
N_TOP_CANDIDATES: 5,
DEFAULT_N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call
// it quits and just show a link.
MAX_PAGES: 5,
DEFAULT_MAX_PAGES: 5,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
@ -336,10 +352,25 @@ Readability.prototype = {
},
_setNodeTag: function (node, tag) {
// FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
// won't actually be set).
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
this.log("_setNodeTag", node, tag);
if (node.__JSDOMParser__) {
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
return node;
}
var replacement = node.ownerDocument.createElement(tag);
while (node.firstChild) {
replacement.appendChild(node.firstChild);
}
node.parentNode.replaceChild(replacement, node);
if (node.readability)
replacement.readability = node.readability;
for (var i = 0; i < node.attributes.length; i++) {
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
}
return replacement;
},
/**
@ -469,6 +500,37 @@ Readability.prototype = {
return node && node.nextElementSibling;
},
/**
* Like _getNextNode, but for DOM implementations with no
* firstElementChild/nextElementSibling functionality...
*/
_getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
function nextSiblingEl(n) {
do {
n = n.nextSibling;
} while (n && n.nodeType !== n.ELEMENT_NODE);
return n;
}
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.children[0]) {
return node.children[0];
}
// Then for siblings...
var next = nextSiblingEl(node);
if (next) {
return next;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
if (node)
next = nextSiblingEl(node);
} while (node && !next);
return node && next;
},
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
@ -494,6 +556,7 @@ Readability.prototype = {
* @return Element
**/
_grabArticle: function (page) {
this.log("**** grabArticle ****");
var doc = this._doc;
var isPaging = (page !== null ? true: false);
page = page ? page : this._doc.body;
@ -548,11 +611,11 @@ Readability.prototype = {
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if (this._hasSinglePInsideElement(node)) {
var newNode = node.firstElementChild;
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
} else if (!this._hasChildBlockElement(node)) {
this._setNodeTag(node, "P");
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
} else {
// EXPERIMENTAL
@ -635,12 +698,12 @@ Readability.prototype = {
this.log('Candidate:', candidate, "with score " + candidateScore);
for (var t = 0; t < this.N_TOP_CANDIDATES; t++) {
for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t];
if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
topCandidates.splice(t, 0, candidate);
if (topCandidates.length > this.N_TOP_CANDIDATES)
if (topCandidates.length > this._nbTopCandidates)
topCandidates.pop();
break;
}
@ -743,7 +806,7 @@ Readability.prototype = {
// Turn it into a div so it doesn't get filtered out later by accident.
this.log("Altering sibling:", sibling, 'to div.');
this._setNodeTag(sibling, "DIV");
sibling = this._setNodeTag(sibling, "DIV");
}
// To ensure a node does not interfere with readability styles,
@ -760,11 +823,11 @@ Readability.prototype = {
}
}
if (this.ENABLE_LOGGING)
if (this._debug)
this.log("Article content pre-prep: " + articleContent.innerHTML);
// So we have all of the content that we need. Now we clean it up for presentation.
this._prepArticle(articleContent);
if (this.ENABLE_LOGGING)
if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML);
if (this._curPageNum === 1) {
@ -787,7 +850,7 @@ Readability.prototype = {
}
}
if (this.ENABLE_LOGGING)
if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML);
// Now that we've gone through the full algorithm, check to see if
@ -900,6 +963,10 @@ Readability.prototype = {
if (scriptNode.parentNode)
scriptNode.parentNode.removeChild(scriptNode);
});
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
if (noscriptNode.parentNode)
noscriptNode.parentNode.removeChild(noscriptNode);
});
},
/**
@ -911,7 +978,7 @@ Readability.prototype = {
**/
_hasSinglePInsideElement: function(element) {
// There should be exactly 1 element child which is a P:
if (element.children.length != 1 || element.firstElementChild.tagName !== "P") {
if (element.children.length != 1 || element.children[0].tagName !== "P") {
return false;
}
@ -1290,7 +1357,7 @@ Readability.prototype = {
doc.getElementById("readability-content").appendChild(articlePage);
if (this._curPageNum > this.MAX_PAGES) {
if (this._curPageNum > this._maxPages) {
var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
return;
@ -1613,6 +1680,17 @@ Readability.prototype = {
* @return void
**/
parse: function () {
// Avoid parsing too large documents, as per configuration option
if (this._maxElemsToParse > 0) {
var numTags = this._doc.getElementsByTagName("*").length;
if (numTags > this._maxElemsToParse) {
throw new Error("Aborting parsing document; " + numTags + " elements found");
}
}
if (typeof this._doc.documentElement.firstElementChild === "undefined") {
this._getNextNode = this._getNextNodeNoElementProperties;
}
// Remove script tags from the document.
this._removeScripts(this._doc);