Bug 1416561 - update readability from git ( c3ff1a2d2c94c1db257b2c9aa88a4b8fbeb221c5 ), r=already-reviewed

MozReview-Commit-ID: 3WX99CsZ4z5

--HG--
extra : rebase_source : 9b87a522b8ecf039a15b2da58af329f8b0eba923
This commit is contained in:
Gijs Kruitbosch 2017-11-12 12:47:32 +00:00
parent 9d834c457f
commit ec42d7e872

View File

@ -38,7 +38,6 @@ function Readability(uri, doc, options) {
this._uri = uri;
this._doc = doc;
this._biggestFrame = false;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
@ -47,24 +46,13 @@ function Readability(uri, doc, options) {
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
// The list of pages we've parsed in this call of readability,
// for autopaging. As a key store for easier searching.
this._parsedPages = {};
// A list of the ETag headers of pages we've parsed, in case they happen to match,
// we'll know it's a duplicate.
this._pageETags = {};
// Make an AJAX request for each page and append it to the document.
this._curPageNum = 1;
var logEl;
// Control whether log messages are sent to the console
@ -110,13 +98,12 @@ Readability.prototype = {
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call
// it quits and just show a link.
DEFAULT_MAX_PAGES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
// The default number of words an article must have in order to return a result
DEFAULT_WORD_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
@ -139,6 +126,10 @@ Readability.prototype = {
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
/**
* Run any post-process modifications to article content as necessary.
*
@ -321,11 +312,20 @@ Readability.prototype = {
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
} catch (e) {/* ignore exceptions setting the title. */}
if (curTitle.match(/ [\|\-] /)) {
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
var titleHadHierarchicalSeparators = false;
function wordCount(str) {
return str.split(/\s+/).length;
}
if (curTitle.split(' ').length < 3)
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
// If there's a separator in the title, first remove the final part
if ((/ [\|\-\\\/>»] /).test(curTitle)) {
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
// If the resulting title is too short (3 words or fewer), remove
// the first part instead:
if (wordCount(curTitle) < 3)
curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
// Check if we have a heading containing this exact string, so we
// could assume it's the full title.
@ -342,7 +342,7 @@ Readability.prototype = {
curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
// If the title is now too short, try the first colon instead:
if (curTitle.split(' ').length < 3)
if (wordCount(curTitle) < 3)
curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
@ -353,9 +353,16 @@ Readability.prototype = {
}
curTitle = curTitle.trim();
if (curTitle.split(' ').length <= 4)
// If we now have 4 words or fewer as our title, and either no
// 'hierarchical' separators (\, /, > or ») were found in the original
// title or we decreased the number of words by more than 1 word, use
// the original title.
var curTitleWordCount = wordCount(curTitle);
if (curTitleWordCount <= 4 &&
(!titleHadHierarchicalSeparators ||
curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
curTitle = origTitle;
}
return curTitle;
},
@ -501,10 +508,16 @@ Readability.prototype = {
var h2 = articleContent.getElementsByTagName('h2');
if (h2.length === 1) {
var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
if (Math.abs(lengthSimilarRate) < 0.5 &&
(lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
this._articleTitle.includes(h2[0].textContent))) {
this._clean(articleContent, "h2");
if (Math.abs(lengthSimilarRate) < 0.5) {
var titlesMatch = false;
if (lengthSimilarRate > 0) {
titlesMatch = h2[0].textContent.includes(this._articleTitle);
} else {
titlesMatch = this._articleTitle.includes(h2[0].textContent);
}
if (titlesMatch) {
this._clean(articleContent, "h2");
}
}
}
@ -1014,24 +1027,22 @@ Readability.prototype = {
if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML);
if (this._curPageNum === 1) {
if (neededToCreateTopCandidate) {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class names here. No need to append
// because that already happened anyway.
topCandidate.id = "readability-page-1";
topCandidate.className = "page";
} else {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
if (neededToCreateTopCandidate) {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class names here. No need to append
// because that already happened anyway.
topCandidate.id = "readability-page-1";
topCandidate.className = "page";
} else {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
}
if (this._debug)
@ -1042,7 +1053,7 @@ Readability.prototype = {
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
if (this._getInnerText(articleContent, true).length < 500) {
if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
@ -1248,26 +1259,25 @@ Readability.prototype = {
* @return void
**/
_cleanStyles: function(e) {
e = e || this._doc;
if (!e)
if (!e || e.tagName.toLowerCase() === 'svg')
return;
var cur = e.firstChild;
// Remove any root styles, if we're able.
if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
e.removeAttribute('style');
// Go until there are no more child nodes
while (cur !== null) {
if (cur.nodeType === cur.ELEMENT_NODE) {
// Remove style attribute(s) :
if (cur.className !== "readability-styled")
cur.removeAttribute("style");
this._cleanStyles(cur);
if (e.className !== 'readability-styled') {
// Remove `style` and deprecated presentational attributes
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
}
cur = cur.nextSibling;
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
e.removeAttribute('width');
e.removeAttribute('height');
}
}
var cur = e.firstElementChild;
while (cur !== null) {
this._cleanStyles(cur);
cur = cur.nextElementSibling;
}
},
@ -1293,363 +1303,6 @@ Readability.prototype = {
return linkLength / textLength;
},
/**
* Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
*
* @author Dan Lacy
* @return string the base url
**/
_findBaseUrl: function() {
var uri = this._uri;
var noUrlParams = uri.pathQueryRef.split("?")[0];
var urlSlashes = noUrlParams.split("/").reverse();
var cleanedSegments = [];
var possibleType = "";
for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
var segment = urlSlashes[i];
// Split off and save anything that looks like a file type.
if (segment.indexOf(".") !== -1) {
possibleType = segment.split(".")[1];
// If the type isn't alpha-only, it's probably not actually a file extension.
if (!possibleType.match(/[^a-zA-Z]/))
segment = segment.split(".")[0];
}
// If our first or second segment has anything looking like a page number, remove it.
if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
var del = false;
// If this is purely a number, and it's the first or second segment,
// it's probably a page number. Remove it.
if (i < 2 && segment.match(/^\d{1,2}$/))
del = true;
// If this is the first segment and it's just "index", remove it.
if (i === 0 && segment.toLowerCase() === "index")
del = true;
// If our first or second segment is smaller than 3 characters,
// and the first segment was purely alphas, remove it.
if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
del = true;
// If it's not marked for deletion, push it to cleanedSegments.
if (!del)
cleanedSegments.push(segment);
}
// This is our final, cleaned, base article URL.
return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
},
/**
* Look for any paging links that may occur within the document, score each
* candidate, and return the most likely "next page" URL.
*
* @param elem The element (typically the document body) whose links are scanned.
* @return string|null cleaned href of the best candidate (score >= 50), else null.
**/
_findNextPageLink: function(elem) {
var uri = this._uri;
var possiblePages = {};
var allLinks = elem.getElementsByTagName('a');
var articleBaseUrl = this._findBaseUrl();
// Loop through all links, looking for hints that they may be next-page links.
// Things like having "page" in their textContent, className or id, or being a child
// of a node with a page-y className or id.
//
// Also possible: levenshtein distance? longest common subsequence?
//
// After we do that, assign each page a score, and pick the top scorer below.
for (var i = 0, il = allLinks.length; i < il; i += 1) {
var link = allLinks[i];
// Normalize: drop any #fragment and any trailing slash before comparing.
var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
// If we've already seen this page, ignore it.
if (linkHref === "" ||
linkHref === articleBaseUrl ||
linkHref === uri.spec ||
linkHref in this._parsedPages) {
continue;
}
// If it's on a different domain, skip it.
if (uri.host !== linkHref.split(/\/+/g)[1])
continue;
var linkText = this._getInnerText(link);
// If the linkText looks like it's not the next page, skip it.
if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
continue;
// If the leftovers of the URL after removing the base URL don't contain
// any digits, it's certainly not a next page link.
var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
if (!linkHrefLeftover.match(/\d/))
continue;
// First sighting of this href starts a fresh entry; repeat sightings
// accumulate their link text for the matching below.
if (!(linkHref in possiblePages)) {
possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
} else {
possiblePages[linkHref].linkText += ' | ' + linkText;
}
var linkObj = possiblePages[linkHref];
// If the articleBaseUrl isn't part of this URL, penalize this link. It could
// still be the link, but the odds are lower.
// Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
if (linkHref.indexOf(articleBaseUrl) !== 0)
linkObj.score -= 25;
// Score the combined text/class/id of the link itself.
var linkData = linkText + ' ' + link.className + ' ' + link.id;
if (linkData.match(this.REGEXPS.nextLink))
linkObj.score += 50;
if (linkData.match(/pag(e|ing|inat)/i))
linkObj.score += 25;
if (linkData.match(/(first|last)/i)) {
// -65 is enough to negate any bonuses gotten from a > or » in the text,
// If we already matched on "next", last is probably fine.
// If we didn't, then it's bad. Penalize.
if (!linkObj.linkText.match(this.REGEXPS.nextLink))
linkObj.score -= 65;
}
if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
linkObj.score -= 50;
if (linkData.match(this.REGEXPS.prevLink))
linkObj.score -= 200;
// If a parentNode contains page or paging or paginat, count it at most
// once in each direction (positive/negative) while walking up the tree.
var parentNode = link.parentNode;
var positiveNodeMatch = false;
var negativeNodeMatch = false;
while (parentNode) {
var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
positiveNodeMatch = true;
linkObj.score += 25;
}
if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
// If this is just something like "footer", give it a negative.
// If it's something like "body-and-footer", leave it be.
if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
linkObj.score -= 25;
negativeNodeMatch = true;
}
}
parentNode = parentNode.parentNode;
}
// If the URL looks like it has paging in it, add to the score.
// Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
linkObj.score += 25;
// If the URL contains negative values, give a slight decrease.
if (linkHref.match(this.REGEXPS.extraneous))
linkObj.score -= 15;
/**
* Minor punishment to anything that doesn't match our current URL.
* NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
* Dan, can you show me a counterexample where this is necessary?
* if (linkHref.indexOf(window.location.href) !== 0) {
* linkObj.score -= 1;
* }
**/
// If the link text can be parsed as a number, give it a minor bonus, with a slight
// bias towards lower numbered pages. This is so that pages that might not have 'next'
// in their text can still get scored, and sorted properly by score.
var linkTextAsNumber = parseInt(linkText, 10);
if (linkTextAsNumber) {
// Punish 1 since we're either already there, or it's probably
// before what we want anyways.
if (linkTextAsNumber === 1) {
linkObj.score -= 10;
} else {
linkObj.score += Math.max(0, 10 - linkTextAsNumber);
}
}
}
// Loop through all of our possible pages from above and find our top
// candidate for the next page URL. Require at least a score of 50, which
// is a relatively high confidence that this page is the next link.
var topPage = null;
for (var page in possiblePages) {
if (possiblePages.hasOwnProperty(page)) {
if (possiblePages[page].score >= 50 &&
(!topPage || topPage.score < possiblePages[page].score))
topPage = possiblePages[page];
}
}
var nextHref = null;
if (topPage) {
nextHref = topPage.href.replace(/\/$/, '');
this.log('NEXT PAGE IS ' + nextHref);
// Remember the chosen page so we never fetch it twice.
this._parsedPages[nextHref] = true;
}
return nextHref;
},
_successfulRequest: function(request) {
return (request.status >= 200 && request.status < 300) ||
request.status === 304 ||
(request.status === 0 && request.responseText);
},
_ajax: function(url, options) {
var request = new XMLHttpRequest();
function respondToReadyState(readyState) {
if (request.readyState === 4) {
if (this._successfulRequest(request)) {
if (options.success)
options.success(request);
} else if (options.error) {
options.error(request);
}
}
}
if (typeof options === 'undefined')
options = {};
request.onreadystatechange = respondToReadyState;
request.open('get', url, true);
request.setRequestHeader('Accept', 'text/html');
try {
request.send(options.postBody);
} catch (e) {
if (options.error)
options.error();
}
return request;
},
/**
* Build a container div for the next page, then fetch `nextPageLink`
* asynchronously, clean its content, and append it to the document.
* Recurses (via the success callback) while further next-page links are
* found, up to this._maxPages pages.
*
* @param nextPageLink URL of the page to fetch and append.
* @return void
**/
_appendNextPage: function(nextPageLink) {
var doc = this._doc;
this._curPageNum += 1;
var articlePage = doc.createElement("DIV");
articlePage.id = 'readability-page-' + this._curPageNum;
articlePage.className = 'page';
// Visually separate appended pages with a section mark.
articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
doc.getElementById("readability-content").appendChild(articlePage);
// Past the page cap, stop inlining content and just link to the next page.
if (this._curPageNum > this._maxPages) {
var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
return;
}
// Now that we've built the article page DOM element, get the page content
// asynchronously and load the cleaned content into the div we created for it.
((pageUrl, thisPage) => {
this._ajax(pageUrl, {
// NOTE(review): this handler is a plain function; _ajax invokes it as
// options.success(request), so `this` inside is the options object,
// not the Readability instance — the this._pageETags / this.log /
// this.REGEXPS accesses below look broken. Confirm before re-enabling
// multi-page support.
success: function(r) {
// First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
var eTag = r.getResponseHeader('ETag');
if (eTag) {
if (eTag in this._pageETags) {
this.log("Exact duplicate page found via ETag. Aborting.");
articlePage.style.display = 'none';
return;
}
this._pageETags[eTag] = 1;
}
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
var page = doc.createElement("DIV");
// Do some preprocessing to our HTML to make it ready for appending.
// - Remove any script tags. Swap and reswap newlines with a unicode
// character because multiline regex doesn't work in javascript.
// - Turn any noscript tags into divs so that we can parse them. This
// allows us to find any next page links hidden via javascript.
// - Turn all double br's into p's - was handled by prepDocument in the original view.
// Maybe in the future abstract out prepDocument to work for both the original document
// and AJAX-added pages.
var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
// NOTE(review): the next line repeats the newline-swap/script-strip just
// applied above — it looks like an accidental duplicate.
responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
page.innerHTML = responseHtml;
this._replaceBrs(page);
// Reset all flags for the next page, as they will search through it and
// disable as necessary at the end of grabArticle.
this._flags = 0x1 | 0x2 | 0x4;
var secondNextPageLink = this._findNextPageLink(page);
// NOTE: if we end up supporting _appendNextPage(), we'll need to
// change this call to be async
var content = this._grabArticle(page);
if (!content) {
this.log("No content found in page to append. Aborting.");
return;
}
// Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
// Compare it against all of the previous documents we've gotten. If the previous
// document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
if (firstP && firstP.innerHTML.length > 100) {
for (var i = 1; i <= this._curPageNum; i += 1) {
var rPage = doc.getElementById('readability-page-' + i);
if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
this.log('Duplicate of page ' + i + ' - skipping.');
articlePage.style.display = 'none';
this._parsedPages[pageUrl] = true;
return;
}
}
}
this._removeScripts(content);
thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
// After the page has rendered, post process the content. This delay is necessary because,
// in webkit at least, offsetWidth is not set in time to determine image width. We have to
// wait a little bit for reflow to finish before we can fix floating images.
setTimeout(() => {
this._postProcessContent(thisPage);
}, 500);
// Keep going while further next-page links are found.
if (secondNextPageLink)
this._appendNextPage(secondNextPageLink);
}
});
})(nextPageLink, articlePage);
},
/**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
@ -1932,10 +1585,6 @@ Readability.prototype = {
return (this._flags & flag) > 0;
},
_addFlag: function(flag) {
this._flags = this._flags | flag;
},
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
@ -2026,16 +1675,6 @@ Readability.prototype = {
// Remove script tags from the document.
this._removeScripts(this._doc);
// FIXME: Disabled multi-page article support for now as it
// needs more work on infrastructure.
// Make sure this document is added to the list of parsed pages first,
// so we don't double up on the first page.
// this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
// Pull out any possible next page link first.
// var nextPageLink = this._findNextPageLink(doc.body);
this._prepDocument();
var metadata = this._getArticleMetadata();
@ -2049,14 +1688,6 @@ Readability.prototype = {
this._postProcessContent(articleContent);
// if (nextPageLink) {
// // Append any additional pages after a small timeout so that people
// // can start reading without having to wait for this to finish processing.
// setTimeout((function() {
// this._appendNextPage(nextPageLink);
// }).bind(this), 500);
// }
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.