Bug 1416561 - update readability from git ( c3ff1a2d2c94c1db257b2c9aa88a4b8fbeb221c5 ), r=already-reviewed
MozReview-Commit-ID: 3WX99CsZ4z5

--HG--
extra : rebase_source : 9b87a522b8ecf039a15b2da58af329f8b0eba923
parent 9d834c457f
commit ec42d7e872
@@ -38,7 +38,6 @@ function Readability(uri, doc, options) {
   this._uri = uri;
   this._doc = doc;
-  this._biggestFrame = false;
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
@@ -47,24 +46,13 @@ function Readability(uri, doc, options) {
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
-  this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+  this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
 
   // Start with all flags set
   this._flags = this.FLAG_STRIP_UNLIKELYS |
                 this.FLAG_WEIGHT_CLASSES |
                 this.FLAG_CLEAN_CONDITIONALLY;
 
-  // The list of pages we've parsed in this call of readability,
-  // for autopaging. As a key store for easier searching.
-  this._parsedPages = {};
-
-  // A list of the ETag headers of pages we've parsed, in case they happen to match,
-  // we'll know it's a duplicate.
-  this._pageETags = {};
-
-  // Make an AJAX request for each page and append it to the document.
-  this._curPageNum = 1;
-
   var logEl;
 
   // Control whether log messages are sent to the console
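
For context, a hedged caller-side sketch (not part of this diff; the `uri` and `document` values are assumed to be of the shape Readability expects): after this change the minimum article length is configured directly, and `maxPages` is no longer an option.

// Hypothetical usage sketch, not part of the shipped source.
// `maxPages` is gone; `wordThreshold` replaces it as the tuning knob.
var reader = new Readability(uri, document, {
  debug: false,
  nbTopCandidates: 5,   // same as DEFAULT_N_TOP_CANDIDATES
  wordThreshold: 250    // accept shorter articles than the default 500
});
var article = reader.parse();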
@@ -110,13 +98,12 @@ Readability.prototype = {
   // tight the competition is among candidates.
   DEFAULT_N_TOP_CANDIDATES: 5,
 
-  // The maximum number of pages to loop through before we call
-  // it quits and just show a link.
-  DEFAULT_MAX_PAGES: 5,
-
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
+  // The default number of words an article must have in order to return a result
+  DEFAULT_WORD_THRESHOLD: 500,
+
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
@@ -139,6 +126,10 @@ Readability.prototype = {
 
   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
 
+  PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+
+  DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+
   /**
    * Run any post-process modifications to article content as necessary.
    *
@@ -321,11 +312,20 @@ Readability.prototype = {
       curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
     } catch (e) {/* ignore exceptions setting the title. */}
 
-    if (curTitle.match(/ [\|\-] /)) {
-      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
+    var titleHadHierarchicalSeparators = false;
+    function wordCount(str) {
+      return str.split(/\s+/).length;
+    }
 
-      if (curTitle.split(' ').length < 3)
-        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
+    // If there's a separator in the title, first remove the final part
+    if ((/ [\|\-\\\/>»] /).test(curTitle)) {
+      titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
+      curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
+
+      // If the resulting title is too short (3 words or fewer), remove
+      // the first part instead:
+      if (wordCount(curTitle) < 3)
+        curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
     } else if (curTitle.indexOf(': ') !== -1) {
       // Check if we have an heading containing this exact string, so we
       // could assume it's the full title.
@@ -342,7 +342,7 @@ Readability.prototype = {
         curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
 
         // If the title is now too short, try the first colon instead:
-        if (curTitle.split(' ').length < 3)
+        if (wordCount(curTitle) < 3)
           curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
       }
     } else if (curTitle.length > 150 || curTitle.length < 15) {
@@ -353,9 +353,16 @@ Readability.prototype = {
     }
 
     curTitle = curTitle.trim();
 
-    if (curTitle.split(' ').length <= 4)
+    // If we now have 4 words or fewer as our title, and either no
+    // 'hierarchical' separators (\, /, > or ») were found in the original
+    // title or we decreased the number of words by more than 1 word, use
+    // the original title.
+    var curTitleWordCount = wordCount(curTitle);
+    if (curTitleWordCount <= 4 &&
+        (!titleHadHierarchicalSeparators ||
+         curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
       curTitle = origTitle;
+    }
 
     return curTitle;
   },
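
To see the new separator handling end to end, here is a standalone sketch that mirrors the added logic without the class plumbing (the input string is hypothetical):

// Standalone sketch of the new separator handling (hypothetical input).
function trimTitle(origTitle) {
  function wordCount(str) {
    return str.split(/\s+/).length;
  }
  var curTitle = origTitle;
  if (/ [\|\-\\\/>»] /.test(curTitle)) {
    // Drop the trailing site-name part after the last separator...
    curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
    // ...unless that leaves fewer than 3 words; then drop the first part.
    if (wordCount(curTitle) < 3)
      curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
  }
  return curTitle.trim();
}
// trimTitle("Some Catchy Headline | Example Site") -> "Some Catchy Headline"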
@@ -501,10 +508,16 @@ Readability.prototype = {
     var h2 = articleContent.getElementsByTagName('h2');
     if (h2.length === 1) {
       var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
-      if (Math.abs(lengthSimilarRate) < 0.5 &&
-          (lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
-           this._articleTitle.includes(h2[0].textContent))) {
-        this._clean(articleContent, "h2");
+      if (Math.abs(lengthSimilarRate) < 0.5) {
+        var titlesMatch = false;
+        if (lengthSimilarRate > 0) {
+          titlesMatch = h2[0].textContent.includes(this._articleTitle);
+        } else {
+          titlesMatch = this._articleTitle.includes(h2[0].textContent);
+        }
+        if (titlesMatch) {
+          this._clean(articleContent, "h2");
+        }
       }
     }
 
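
A worked sketch of the unfolded duplicate-title check, with hypothetical strings:

// Hypothetical values: a lone <h2> whose text is within 50% of the article
// title's length, where the longer string contains the shorter, is treated
// as a duplicate of the title.
var articleTitle = "JavaScript Injection in ActionScript";
var h2Text = "JavaScript Injection in ActionScript 3";
var lengthSimilarRate = (h2Text.length - articleTitle.length) / articleTitle.length;
var titlesMatch = false;
if (Math.abs(lengthSimilarRate) < 0.5) {
  titlesMatch = lengthSimilarRate > 0 ?
    h2Text.includes(articleTitle) :
    articleTitle.includes(h2Text);
}
// titlesMatch === true here, so the h2 would be cleaned from the content.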
@@ -1014,24 +1027,22 @@ Readability.prototype = {
     if (this._debug)
       this.log("Article content post-prep: " + articleContent.innerHTML);
 
-    if (this._curPageNum === 1) {
-      if (neededToCreateTopCandidate) {
-        // We already created a fake div thing, and there wouldn't have been any siblings left
-        // for the previous loop, so there's no point trying to create a new div, and then
-        // move all the children over. Just assign IDs and class names here. No need to append
-        // because that already happened anyway.
-        topCandidate.id = "readability-page-1";
-        topCandidate.className = "page";
-      } else {
-        var div = doc.createElement("DIV");
-        div.id = "readability-page-1";
-        div.className = "page";
-        var children = articleContent.childNodes;
-        while (children.length) {
-          div.appendChild(children[0]);
-        }
-        articleContent.appendChild(div);
-      }
+    if (neededToCreateTopCandidate) {
+      // We already created a fake div thing, and there wouldn't have been any siblings left
+      // for the previous loop, so there's no point trying to create a new div, and then
+      // move all the children over. Just assign IDs and class names here. No need to append
+      // because that already happened anyway.
+      topCandidate.id = "readability-page-1";
+      topCandidate.className = "page";
+    } else {
+      var div = doc.createElement("DIV");
+      div.id = "readability-page-1";
+      div.className = "page";
+      var children = articleContent.childNodes;
+      while (children.length) {
+        div.appendChild(children[0]);
+      }
+      articleContent.appendChild(div);
     }
 
     if (this._debug)
@@ -1042,7 +1053,7 @@ Readability.prototype = {
     // grabArticle with different flags set. This gives us a higher likelihood of
     // finding the content, and the sieve approach gives us a higher likelihood of
     // finding the -right- content.
-    if (this._getInnerText(articleContent, true).length < 500) {
+    if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
      page.innerHTML = pageCacheHtml;
 
      if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
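
A simplified restatement of the retry sieve those comments describe (hypothetical helper names; the real loop lives in grabArticle and compares inner-text length against the now-configurable threshold):

// Conceptual sketch, not the shipped code: each failed grab relaxes one
// heuristic flag, in the same order the source disables them.
function grabWithRetries(grab, wordThreshold) {
  var flags = ["stripUnlikelys", "weightClasses", "cleanConditionally"];
  while (true) {
    var text = grab(flags);          // assumed to return the article's inner text
    if (text.length >= wordThreshold)
      return text;                   // long enough: accept this attempt
    if (flags.length === 0)
      return null;                   // nothing left to relax: give up
    flags.shift();                   // disable one more flag and retry
  }
}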
@@ -1248,26 +1259,25 @@ Readability.prototype = {
    * @return void
   **/
   _cleanStyles: function(e) {
-    e = e || this._doc;
-    if (!e)
+    if (!e || e.tagName.toLowerCase() === 'svg')
       return;
-    var cur = e.firstChild;
 
-    // Remove any root styles, if we're able.
-    if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
-      e.removeAttribute('style');
-
-    // Go until there are no more child nodes
-    while (cur !== null) {
-      if (cur.nodeType === cur.ELEMENT_NODE) {
-        // Remove style attribute(s) :
-        if (cur.className !== "readability-styled")
-          cur.removeAttribute("style");
-
-        this._cleanStyles(cur);
+    if (e.className !== 'readability-styled') {
+      // Remove `style` and deprecated presentational attributes
+      for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+        e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
       }
 
-      cur = cur.nextSibling;
-    }
+      if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+        e.removeAttribute('width');
+        e.removeAttribute('height');
+      }
+    }
+
+    var cur = e.firstElementChild;
+    while (cur !== null) {
+      this._cleanStyles(cur);
+      cur = cur.nextElementSibling;
+    }
   },
 
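
A hedged, standalone stand-in for the rewritten _cleanStyles, showing what the new traversal strips (the function name and inlined tables are illustrative, copied from the constants added above):

// Illustrative stand-in, not the shipped method.
function cleanPresentational(e) {
  var ATTRS = ["align", "background", "bgcolor", "border", "cellpadding",
               "cellspacing", "frame", "hspace", "rules", "style",
               "valign", "vspace"];
  var SIZED = ["TABLE", "TH", "TD", "HR", "PRE"];
  if (!e || e.tagName.toLowerCase() === 'svg')
    return;                                  // leave SVG subtrees untouched
  if (e.className !== 'readability-styled') {
    ATTRS.forEach(function(a) { e.removeAttribute(a); });
    if (SIZED.indexOf(e.tagName) !== -1) {   // drop deprecated sizing too
      e.removeAttribute('width');
      e.removeAttribute('height');
    }
  }
  // firstElementChild/nextElementSibling give an element-only walk, unlike
  // the old firstChild/nextSibling loop that also visited text nodes.
  for (var cur = e.firstElementChild; cur; cur = cur.nextElementSibling)
    cleanPresentational(cur);
}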
@@ -1293,363 +1303,6 @@ Readability.prototype = {
     return linkLength / textLength;
   },
 
-  /**
-   * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
-   *
-   * @author Dan Lacy
-   * @return string the base url
-  **/
-  _findBaseUrl: function() {
-    var uri = this._uri;
-    var noUrlParams = uri.pathQueryRef.split("?")[0];
-    var urlSlashes = noUrlParams.split("/").reverse();
-    var cleanedSegments = [];
-    var possibleType = "";
-
-    for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
-      var segment = urlSlashes[i];
-
-      // Split off and save anything that looks like a file type.
-      if (segment.indexOf(".") !== -1) {
-        possibleType = segment.split(".")[1];
-
-        // If the type isn't alpha-only, it's probably not actually a file extension.
-        if (!possibleType.match(/[^a-zA-Z]/))
-          segment = segment.split(".")[0];
-      }
-
-      // If our first or second segment has anything looking like a page number, remove it.
-      if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
-        segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
-
-      var del = false;
-
-      // If this is purely a number, and it's the first or second segment,
-      // it's probably a page number. Remove it.
-      if (i < 2 && segment.match(/^\d{1,2}$/))
-        del = true;
-
-      // If this is the first segment and it's just "index", remove it.
-      if (i === 0 && segment.toLowerCase() === "index")
-        del = true;
-
-      // If our first or second segment is smaller than 3 characters,
-      // and the first segment was purely alphas, remove it.
-      if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
-        del = true;
-
-      // If it's not marked for deletion, push it to cleanedSegments.
-      if (!del)
-        cleanedSegments.push(segment);
-    }
-
-    // This is our final, cleaned, base article URL.
-    return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
-  },
-
-  /**
-   * Look for any paging links that may occur within the document.
-   *
-   * @param body
-   * @return object (array)
-  **/
-  _findNextPageLink: function(elem) {
-    var uri = this._uri;
-    var possiblePages = {};
-    var allLinks = elem.getElementsByTagName('a');
-    var articleBaseUrl = this._findBaseUrl();
-
-    // Loop through all links, looking for hints that they may be next-page links.
-    // Things like having "page" in their textContent, className or id, or being a child
-    // of a node with a page-y className or id.
-    //
-    // Also possible: levenshtein distance? longest common subsequence?
-    //
-    // After we do that, assign each page a score, and
-    for (var i = 0, il = allLinks.length; i < il; i += 1) {
-      var link = allLinks[i];
-      var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
-
-      // If we've already seen this page, ignore it.
-      if (linkHref === "" ||
-          linkHref === articleBaseUrl ||
-          linkHref === uri.spec ||
-          linkHref in this._parsedPages) {
-        continue;
-      }
-
-      // If it's on a different domain, skip it.
-      if (uri.host !== linkHref.split(/\/+/g)[1])
-        continue;
-
-      var linkText = this._getInnerText(link);
-
-      // If the linkText looks like it's not the next page, skip it.
-      if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
-        continue;
-
-      // If the leftovers of the URL after removing the base URL don't contain
-      // any digits, it's certainly not a next page link.
-      var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
-      if (!linkHrefLeftover.match(/\d/))
-        continue;
-
-      if (!(linkHref in possiblePages)) {
-        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
-      } else {
-        possiblePages[linkHref].linkText += ' | ' + linkText;
-      }
-
-      var linkObj = possiblePages[linkHref];
-
-      // If the articleBaseUrl isn't part of this URL, penalize this link. It could
-      // still be the link, but the odds are lower.
-      // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
-      if (linkHref.indexOf(articleBaseUrl) !== 0)
-        linkObj.score -= 25;
-
-      var linkData = linkText + ' ' + link.className + ' ' + link.id;
-      if (linkData.match(this.REGEXPS.nextLink))
-        linkObj.score += 50;
-
-      if (linkData.match(/pag(e|ing|inat)/i))
-        linkObj.score += 25;
-
-      if (linkData.match(/(first|last)/i)) {
-        // -65 is enough to negate any bonuses gotten from a > or » in the text,
-        // If we already matched on "next", last is probably fine.
-        // If we didn't, then it's bad. Penalize.
-        if (!linkObj.linkText.match(this.REGEXPS.nextLink))
-          linkObj.score -= 65;
-      }
-
-      if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
-        linkObj.score -= 50;
-
-      if (linkData.match(this.REGEXPS.prevLink))
-        linkObj.score -= 200;
-
-      // If a parentNode contains page or paging or paginat
-      var parentNode = link.parentNode;
-      var positiveNodeMatch = false;
-      var negativeNodeMatch = false;
-
-      while (parentNode) {
-        var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
-        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
-          positiveNodeMatch = true;
-          linkObj.score += 25;
-        }
-
-        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
-          // If this is just something like "footer", give it a negative.
-          // If it's something like "body-and-footer", leave it be.
-          if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
-            linkObj.score -= 25;
-            negativeNodeMatch = true;
-          }
-        }
-
-        parentNode = parentNode.parentNode;
-      }
-
-      // If the URL looks like it has paging in it, add to the score.
-      // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
-      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
-        linkObj.score += 25;
-
-      // If the URL contains negative values, give a slight decrease.
-      if (linkHref.match(this.REGEXPS.extraneous))
-        linkObj.score -= 15;
-
-      /**
-       * Minor punishment to anything that doesn't match our current URL.
-       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
-       * Dan, can you show me a counterexample where this is necessary?
-       * if (linkHref.indexOf(window.location.href) !== 0) {
-       *   linkObj.score -= 1;
-       * }
-      **/
-
-      // If the link text can be parsed as a number, give it a minor bonus, with a slight
-      // bias towards lower numbered pages. This is so that pages that might not have 'next'
-      // in their text can still get scored, and sorted properly by score.
-      var linkTextAsNumber = parseInt(linkText, 10);
-      if (linkTextAsNumber) {
-        // Punish 1 since we're either already there, or it's probably
-        // before what we want anyways.
-        if (linkTextAsNumber === 1) {
-          linkObj.score -= 10;
-        } else {
-          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
-        }
-      }
-    }
-
-    // Loop thrugh all of our possible pages from above and find our top
-    // candidate for the next page URL. Require at least a score of 50, which
-    // is a relatively high confidence that this page is the next link.
-    var topPage = null;
-    for (var page in possiblePages) {
-      if (possiblePages.hasOwnProperty(page)) {
-        if (possiblePages[page].score >= 50 &&
-            (!topPage || topPage.score < possiblePages[page].score))
-          topPage = possiblePages[page];
-      }
-    }
-
-    var nextHref = null;
-    if (topPage) {
-      nextHref = topPage.href.replace(/\/$/, '');
-
-      this.log('NEXT PAGE IS ' + nextHref);
-      this._parsedPages[nextHref] = true;
-    }
-    return nextHref;
-  },
-
-  _successfulRequest: function(request) {
-    return (request.status >= 200 && request.status < 300) ||
-            request.status === 304 ||
-           (request.status === 0 && request.responseText);
-  },
-
-  _ajax: function(url, options) {
-    var request = new XMLHttpRequest();
-
-    function respondToReadyState(readyState) {
-      if (request.readyState === 4) {
-        if (this._successfulRequest(request)) {
-          if (options.success)
-            options.success(request);
-        } else if (options.error) {
-          options.error(request);
-        }
-      }
-    }
-
-    if (typeof options === 'undefined')
-      options = {};
-
-    request.onreadystatechange = respondToReadyState;
-
-    request.open('get', url, true);
-    request.setRequestHeader('Accept', 'text/html');
-
-    try {
-      request.send(options.postBody);
-    } catch (e) {
-      if (options.error)
-        options.error();
-    }
-
-    return request;
-  },
-
-  _appendNextPage: function(nextPageLink) {
-    var doc = this._doc;
-    this._curPageNum += 1;
-
-    var articlePage = doc.createElement("DIV");
-    articlePage.id = 'readability-page-' + this._curPageNum;
-    articlePage.className = 'page';
-    articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">§</p>';
-
-    doc.getElementById("readability-content").appendChild(articlePage);
-
-    if (this._curPageNum > this._maxPages) {
-      var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
-      articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
-      return;
-    }
-
-    // Now that we've built the article page DOM element, get the page content
-    // asynchronously and load the cleaned content into the div we created for it.
-    ((pageUrl, thisPage) => {
-      this._ajax(pageUrl, {
-        success: function(r) {
-
-          // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
-          var eTag = r.getResponseHeader('ETag');
-          if (eTag) {
-            if (eTag in this._pageETags) {
-              this.log("Exact duplicate page found via ETag. Aborting.");
-              articlePage.style.display = 'none';
-              return;
-            }
-            this._pageETags[eTag] = 1;
-          }
-
-          // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
-          var page = doc.createElement("DIV");
-
-          // Do some preprocessing to our HTML to make it ready for appending.
-          // - Remove any script tags. Swap and reswap newlines with a unicode
-          //   character because multiline regex doesn't work in javascript.
-          // - Turn any noscript tags into divs so that we can parse them. This
-          //   allows us to find any next page links hidden via javascript.
-          // - Turn all double br's into p's - was handled by prepDocument in the original view.
-          //   Maybe in the future abstract out prepDocument to work for both the original document
-          //   and AJAX-added pages.
-          var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
-          responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
-          page.innerHTML = responseHtml;
-          this._replaceBrs(page);
-
-          // Reset all flags for the next page, as they will search through it and
-          // disable as necessary at the end of grabArticle.
-          this._flags = 0x1 | 0x2 | 0x4;
-
-          var secondNextPageLink = this._findNextPageLink(page);
-
-          // NOTE: if we end up supporting _appendNextPage(), we'll need to
-          // change this call to be async
-          var content = this._grabArticle(page);
-
-          if (!content) {
-            this.log("No content found in page to append. Aborting.");
-            return;
-          }
-
-          // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
-          // Compare it against all of the the previous document's we've gotten. If the previous
-          // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
-          var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
-          if (firstP && firstP.innerHTML.length > 100) {
-            for (var i = 1; i <= this._curPageNum; i += 1) {
-              var rPage = doc.getElementById('readability-page-' + i);
-              if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
-                this.log('Duplicate of page ' + i + ' - skipping.');
-                articlePage.style.display = 'none';
-                this._parsedPages[pageUrl] = true;
-                return;
-              }
-            }
-          }
-
-          this._removeScripts(content);
-
-          thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
-          // After the page has rendered, post process the content. This delay is necessary because,
-          // in webkit at least, offsetWidth is not set in time to determine image width. We have to
-          // wait a little bit for reflow to finish before we can fix floating images.
-          setTimeout(() => {
-            this._postProcessContent(thisPage);
-          }, 500);
-
-          if (secondNextPageLink)
-            this._appendNextPage(secondNextPageLink);
-        }
-      });
-    })(nextPageLink, articlePage);
-  },
-
   /**
    * Get an elements class/id weight. Uses regular expressions to tell if this
    * element looks good or bad.
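
Distilled from the deleted heuristic, a worked example of how one candidate link would have been scored (hypothetical markup and values):

// Hypothetical candidate: <a class="pagination-next" href=".../article/page/2">Next page</a>
var score = 0;
score += 50;   // linkData matches REGEXPS.nextLink ("next")
score += 25;   // linkData matches /pag(e|ing|inat)/i ("pagination")
score += 25;   // a parent node also has a page-y className or id
score += 25;   // the href matches a paging pattern ("/page/2")
// linkText "Next page" is not numeric, so no parseInt bonus applies.
// Final score 125 >= 50, so this link would have won as the next-page URL.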
@@ -1932,10 +1585,6 @@ Readability.prototype = {
     return (this._flags & flag) > 0;
   },
 
-  _addFlag: function(flag) {
-    this._flags = this._flags | flag;
-  },
-
   _removeFlag: function(flag) {
     this._flags = this._flags & ~flag;
   },
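
The flag helpers that remain are plain bit arithmetic; a short worked example using the constant values this file declares (0x1, 0x2, 0x4):

// Worked example of the bitmask helpers left after this hunk.
var FLAG_STRIP_UNLIKELYS = 0x1,
    FLAG_WEIGHT_CLASSES = 0x2,
    FLAG_CLEAN_CONDITIONALLY = 0x4;

var flags = FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY; // 0b111
flags = flags & ~FLAG_WEIGHT_CLASSES;            // _removeFlag: 0b111 -> 0b101
var active = (flags & FLAG_WEIGHT_CLASSES) > 0;  // _flagIsActive: false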
@@ -2026,16 +1675,6 @@ Readability.prototype = {
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
-    // FIXME: Disabled multi-page article support for now as it
-    // needs more work on infrastructure.
-
-    // Make sure this document is added to the list of parsed pages first,
-    // so we don't double up on the first page.
-    // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
-
-    // Pull out any possible next page link first.
-    // var nextPageLink = this._findNextPageLink(doc.body);
-
     this._prepDocument();
 
     var metadata = this._getArticleMetadata();
@@ -2049,14 +1688,6 @@ Readability.prototype = {
 
     this._postProcessContent(articleContent);
 
-    // if (nextPageLink) {
-    //   // Append any additional pages after a small timeout so that people
-    //   // can start reading without having to wait for this to finish processing.
-    //   setTimeout((function() {
-    //     this._appendNextPage(nextPageLink);
-    //   }).bind(this), 500);
-    // }
-
     // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This is used for displaying a preview of
    // the article's content.
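
The fallback that comment describes amounts to the following sketch (variable names assumed from the surrounding parse() body; the actual assignment follows in the unchanged source):

// Hedged sketch of the excerpt fallback: prefer metadata, else the first <p>.
if (!metadata.excerpt) {
  var paragraphs = articleContent.getElementsByTagName("p");
  if (paragraphs.length > 0)
    metadata.excerpt = paragraphs[0].textContent.trim();
}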