mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-05 05:30:29 +00:00
No bug: update readability libs to the up-to-date github versions. rs=me+Gijs
--HG-- extra : rebase_source : a3fe5635643ee253f82764f8e54409982978703e
This commit is contained in:
parent
7b11753f7f
commit
faf2258dd6
@ -103,7 +103,7 @@ Readability.prototype = {
|
||||
byline: /byline|author|dateline|writtenby/i,
|
||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||
normalize: /\s{2,}/g,
|
||||
videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
|
||||
videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
||||
prevLink: /(prev|earl|old|new|<|«)/i,
|
||||
whitespace: /^\s*$/,
|
||||
@ -125,6 +125,36 @@ Readability.prototype = {
|
||||
this._fixRelativeUris(articleContent);
|
||||
},
|
||||
|
||||
/**
|
||||
* Iterate over a NodeList, which doesn't natively fully implement the Array
|
||||
* interface.
|
||||
*
|
||||
* For convenience, the current object context is applied to the provided
|
||||
* iterate function.
|
||||
*
|
||||
* @param NodeList nodeList The NodeList.
|
||||
* @param Function fn The iterate function.
|
||||
* @return void
|
||||
*/
|
||||
_forEachNode: function(nodeList, fn) {
|
||||
return Array.prototype.forEach.call(nodeList, fn, this);
|
||||
},
|
||||
|
||||
/**
|
||||
* Iterate over a NodeList, return true if any of the provided iterate
|
||||
* function calls returns true, false otherwise.
|
||||
*
|
||||
* For convenience, the current object context is applied to the
|
||||
* provided iterate function.
|
||||
*
|
||||
* @param NodeList nodeList The NodeList.
|
||||
* @param Function fn The iterate function.
|
||||
* @return Boolean
|
||||
*/
|
||||
_someNode: function(nodeList, fn) {
|
||||
return Array.prototype.some.call(nodeList, fn, this);
|
||||
},
|
||||
|
||||
/**
|
||||
* Converts each <a> and <img> uri in the given element to an absolute URI.
|
||||
*
|
||||
@ -149,6 +179,10 @@ Readability.prototype = {
|
||||
if (uri[0] == "/")
|
||||
return prePath + uri;
|
||||
|
||||
// Dotslash relative URI.
|
||||
if (uri.indexOf("./") === 0)
|
||||
return pathBase + uri.slice(2);
|
||||
|
||||
// Standard relative URI; add entire path. pathBase already includes a
|
||||
// trailing "/".
|
||||
return pathBase + uri;
|
||||
@ -156,19 +190,18 @@ Readability.prototype = {
|
||||
|
||||
function convertRelativeURIs(tagName, propName) {
|
||||
var elems = articleContent.getElementsByTagName(tagName);
|
||||
for (var i = elems.length; --i >= 0;) {
|
||||
var elem = elems[i];
|
||||
this._forEachNode(elems, function(elem) {
|
||||
var relativeURI = elem.getAttribute(propName);
|
||||
if (relativeURI != null)
|
||||
elems[i].setAttribute(propName, toAbsoluteURI(relativeURI));
|
||||
}
|
||||
elem.setAttribute(propName, toAbsoluteURI(relativeURI));
|
||||
});
|
||||
}
|
||||
|
||||
// Fix links.
|
||||
convertRelativeURIs("a", "href");
|
||||
convertRelativeURIs.call(this, "a", "href");
|
||||
|
||||
// Fix images.
|
||||
convertRelativeURIs("img", "src");
|
||||
convertRelativeURIs.call(this, "img", "src");
|
||||
},
|
||||
|
||||
/**
|
||||
@ -224,19 +257,17 @@ Readability.prototype = {
|
||||
var doc = this._doc;
|
||||
|
||||
// Remove all style tags in head
|
||||
var styleTags = doc.getElementsByTagName("style");
|
||||
for (var st = styleTags.length - 1; st >= 0; st -= 1) {
|
||||
styleTags[st].parentNode.removeChild(styleTags[st]);
|
||||
}
|
||||
this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
|
||||
styleNode.parentNode.removeChild(styleNode);
|
||||
});
|
||||
|
||||
if (doc.body) {
|
||||
this._replaceBrs(doc.body);
|
||||
}
|
||||
|
||||
var fonts = doc.getElementsByTagName("FONT");
|
||||
for (var i = fonts.length; --i >=0;) {
|
||||
this._setNodeTag(fonts[i], "SPAN");
|
||||
}
|
||||
this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
|
||||
this._setNodeTag(fontNode, "SPAN");
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -262,9 +293,7 @@ Readability.prototype = {
|
||||
* <div>foo<br>bar<p>abc</p></div>
|
||||
*/
|
||||
_replaceBrs: function (elem) {
|
||||
var brs = elem.getElementsByTagName("br");
|
||||
for (var i = 0; i < brs.length; i++) {
|
||||
var br = brs[i];
|
||||
this._forEachNode(elem.getElementsByTagName("br"), function(br) {
|
||||
var next = br.nextSibling;
|
||||
|
||||
// Whether 2 or more <br> elements have been found and replaced with a
|
||||
@ -303,7 +332,7 @@ Readability.prototype = {
|
||||
next = sibling;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
_setNodeTag: function (node, tag) {
|
||||
@ -326,6 +355,7 @@ Readability.prototype = {
|
||||
// Clean out junk from the article content
|
||||
this._cleanConditionally(articleContent, "form");
|
||||
this._clean(articleContent, "object");
|
||||
this._clean(articleContent, "embed");
|
||||
this._clean(articleContent, "h1");
|
||||
|
||||
// If there is only one h2, they are probably using it as a header
|
||||
@ -343,26 +373,23 @@ Readability.prototype = {
|
||||
this._cleanConditionally(articleContent, "div");
|
||||
|
||||
// Remove extra paragraphs
|
||||
var articleParagraphs = articleContent.getElementsByTagName('p');
|
||||
for (var i = articleParagraphs.length - 1; i >= 0; i -= 1) {
|
||||
var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
|
||||
var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
|
||||
var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
|
||||
this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
|
||||
var imgCount = paragraph.getElementsByTagName('img').length;
|
||||
var embedCount = paragraph.getElementsByTagName('embed').length;
|
||||
var objectCount = paragraph.getElementsByTagName('object').length;
|
||||
// At this point, nasty iframes have been removed, only remain embedded video ones.
|
||||
var iframeCount = paragraph.getElementsByTagName('iframe').length;
|
||||
var totalCount = imgCount + embedCount + objectCount + iframeCount;
|
||||
|
||||
if (imgCount === 0 &&
|
||||
embedCount === 0 &&
|
||||
objectCount === 0 &&
|
||||
this._getInnerText(articleParagraphs[i], false) === '')
|
||||
articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
|
||||
}
|
||||
if (totalCount === 0 && !this._getInnerText(paragraph, false))
|
||||
paragraph.parentNode.removeChild(paragraph);
|
||||
});
|
||||
|
||||
var brs = articleContent.getElementsByTagName("BR");
|
||||
for (var i = brs.length; --i >= 0;) {
|
||||
var br = brs[i];
|
||||
this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
|
||||
var next = this._nextElement(br.nextSibling);
|
||||
if (next && next.tagName == "P")
|
||||
br.parentNode.removeChild(br);
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -529,8 +556,7 @@ Readability.prototype = {
|
||||
elementsToScore.push(node);
|
||||
} else {
|
||||
// EXPERIMENTAL
|
||||
for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
|
||||
var childNode = node.childNodes[i];
|
||||
this._forEachNode(node.childNodes, function(childNode) {
|
||||
if (childNode.nodeType === Node.TEXT_NODE) {
|
||||
var p = doc.createElement('p');
|
||||
p.textContent = childNode.textContent;
|
||||
@ -538,7 +564,7 @@ Readability.prototype = {
|
||||
p.className = 'readability-styled';
|
||||
node.replaceChild(p, childNode);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
node = this._getNextNode(node);
|
||||
@ -551,17 +577,17 @@ Readability.prototype = {
|
||||
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
|
||||
**/
|
||||
var candidates = [];
|
||||
for (var pt = 0; pt < elementsToScore.length; pt += 1) {
|
||||
var parentNode = elementsToScore[pt].parentNode;
|
||||
this._forEachNode(elementsToScore, function(elementToScore) {
|
||||
var parentNode = elementToScore.parentNode;
|
||||
var grandParentNode = parentNode ? parentNode.parentNode : null;
|
||||
var innerText = this._getInnerText(elementsToScore[pt]);
|
||||
var innerText = this._getInnerText(elementToScore);
|
||||
|
||||
if (!parentNode || typeof(parentNode.tagName) === 'undefined')
|
||||
continue;
|
||||
return;
|
||||
|
||||
// If this paragraph is less than 25 characters, don't even count it.
|
||||
if (innerText.length < 25)
|
||||
continue;
|
||||
return;
|
||||
|
||||
// Initialize readability data for the parent.
|
||||
if (typeof parentNode.readability === 'undefined') {
|
||||
@ -593,7 +619,7 @@ Readability.prototype = {
|
||||
|
||||
if (grandParentNode)
|
||||
grandParentNode.readability.contentScore += contentScore / 2;
|
||||
}
|
||||
});
|
||||
|
||||
// After we've calculated scores, loop through all of the possible
|
||||
// candidate nodes we found and find the one with the highest score.
|
||||
@ -650,9 +676,9 @@ Readability.prototype = {
|
||||
// below does some of that - but only if we've looked high enough up the DOM
|
||||
// tree.
|
||||
var parentOfTopCandidate = topCandidate.parentNode;
|
||||
var lastScore = topCandidate.readability.contentScore;
|
||||
// The scores shouldn't get too low.
|
||||
var scoreThreshold = topCandidate.readability.contentScore / 3;
|
||||
var lastScore = parentOfTopCandidate.readability.contentScore;
|
||||
var scoreThreshold = lastScore / 3;
|
||||
while (parentOfTopCandidate && parentOfTopCandidate.readability) {
|
||||
var parentScore = parentOfTopCandidate.readability.contentScore;
|
||||
if (parentScore < scoreThreshold)
|
||||
@ -662,6 +688,7 @@ Readability.prototype = {
|
||||
topCandidate = parentOfTopCandidate;
|
||||
break;
|
||||
}
|
||||
lastScore = parentOfTopCandidate.readability.contentScore;
|
||||
parentOfTopCandidate = parentOfTopCandidate.parentNode;
|
||||
}
|
||||
}
|
||||
@ -804,7 +831,7 @@ Readability.prototype = {
|
||||
|
||||
/**
|
||||
* Attempts to get excerpt and byline metadata for the article.
|
||||
*
|
||||
*
|
||||
* @return Object with optional "excerpt" and "byline" properties
|
||||
*/
|
||||
_getArticleMetadata: function() {
|
||||
@ -820,14 +847,13 @@ Readability.prototype = {
|
||||
var propertyPattern = /^\s*og\s*:\s*description\s*$/gi;
|
||||
|
||||
// Find description tags.
|
||||
for (var i = 0; i < metaElements.length; i++) {
|
||||
var element = metaElements[i];
|
||||
this._forEachNode(metaElements, function(element) {
|
||||
var elementName = element.getAttribute("name");
|
||||
var elementProperty = element.getAttribute("property");
|
||||
|
||||
if (elementName === "author") {
|
||||
metadata.byline = element.getAttribute("content");
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
|
||||
var name = null;
|
||||
@ -846,7 +872,7 @@ Readability.prototype = {
|
||||
values[name] = content.trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if ("description" in values) {
|
||||
metadata.excerpt = values["description"];
|
||||
@ -867,14 +893,13 @@ Readability.prototype = {
|
||||
* @param Element
|
||||
**/
|
||||
_removeScripts: function(doc) {
|
||||
var scripts = doc.getElementsByTagName('script');
|
||||
for (var i = scripts.length - 1; i >= 0; i -= 1) {
|
||||
scripts[i].nodeValue="";
|
||||
scripts[i].removeAttribute('src');
|
||||
this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
|
||||
scriptNode.nodeValue = "";
|
||||
scriptNode.removeAttribute('src');
|
||||
|
||||
if (scripts[i].parentNode)
|
||||
scripts[i].parentNode.removeChild(scripts[i]);
|
||||
}
|
||||
if (scriptNode.parentNode)
|
||||
scriptNode.parentNode.removeChild(scriptNode);
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -884,22 +909,17 @@ Readability.prototype = {
|
||||
*
|
||||
* @param Element
|
||||
**/
|
||||
_hasSinglePInsideElement: function(e) {
|
||||
_hasSinglePInsideElement: function(element) {
|
||||
// There should be exactly 1 element child which is a P:
|
||||
if (e.children.length != 1 || e.firstElementChild.tagName !== "P") {
|
||||
if (element.children.length != 1 || element.firstElementChild.tagName !== "P") {
|
||||
return false;
|
||||
}
|
||||
// And there should be no text nodes with real content
|
||||
var childNodes = e.childNodes;
|
||||
for (var i = childNodes.length; --i >= 0;) {
|
||||
var node = childNodes[i];
|
||||
if (node.nodeType == Node.TEXT_NODE &&
|
||||
this.REGEXPS.hasContent.test(node.textContent)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
// And there should be no text nodes with real content
|
||||
return !this._someNode(element.childNodes, function(node) {
|
||||
return node.nodeType === Node.TEXT_NODE &&
|
||||
this.REGEXPS.hasContent.test(node.textContent);
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -907,14 +927,11 @@ Readability.prototype = {
|
||||
*
|
||||
* @param Element
|
||||
*/
|
||||
_hasChildBlockElement: function (e) {
|
||||
var length = e.children.length;
|
||||
for (var i = 0; i < length; i++) {
|
||||
var child = e.children[i];
|
||||
if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
_hasChildBlockElement: function (element) {
|
||||
return this._someNode(element.childNodes, function(node) {
|
||||
return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
|
||||
this._hasChildBlockElement(node);
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -922,11 +939,12 @@ Readability.prototype = {
|
||||
* This also strips out any excess whitespace to be found.
|
||||
*
|
||||
* @param Element
|
||||
* @param Boolean normalizeSpaces (default: true)
|
||||
* @return string
|
||||
**/
|
||||
_getInnerText: function(e, normalizeSpaces) {
|
||||
var textContent = e.textContent.trim();
|
||||
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
|
||||
var textContent = e.textContent.trim();
|
||||
|
||||
if (normalizeSpaces) {
|
||||
return textContent.replace(this.REGEXPS.normalize, " ");
|
||||
@ -985,14 +1003,17 @@ Readability.prototype = {
|
||||
* @param Element
|
||||
* @return number (float)
|
||||
**/
|
||||
_getLinkDensity: function(e) {
|
||||
var links = e.getElementsByTagName("a");
|
||||
var textLength = this._getInnerText(e).length;
|
||||
_getLinkDensity: function(element) {
|
||||
var textLength = this._getInnerText(element).length;
|
||||
if (textLength === 0)
|
||||
return;
|
||||
|
||||
var linkLength = 0;
|
||||
|
||||
for (var i = 0, il = links.length; i < il; i += 1) {
|
||||
linkLength += this._getInnerText(links[i]).length;
|
||||
}
|
||||
// XXX implement _reduceNodeList?
|
||||
this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
|
||||
linkLength += this._getInnerText(linkNode).length;
|
||||
});
|
||||
|
||||
return linkLength / textLength;
|
||||
},
|
||||
@ -1405,28 +1426,26 @@ Readability.prototype = {
|
||||
* @return void
|
||||
**/
|
||||
_clean: function(e, tag) {
|
||||
var targetList = e.getElementsByTagName(tag);
|
||||
var isEmbed = (tag === 'object' || tag === 'embed');
|
||||
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
|
||||
|
||||
for (var y = targetList.length - 1; y >= 0; y -= 1) {
|
||||
this._forEachNode(e.getElementsByTagName(tag), function(element) {
|
||||
// Allow youtube and vimeo videos through as people usually want to see those.
|
||||
if (isEmbed) {
|
||||
var attributeValues = "";
|
||||
for (var i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
|
||||
attributeValues += targetList[y].attributes[i].value + '|';
|
||||
}
|
||||
var attributeValues = [].map.call(element.attributes, function(attr) {
|
||||
return attr.value;
|
||||
}).join("|");
|
||||
|
||||
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
||||
if (this.REGEXPS.videos.test(attributeValues))
|
||||
continue;
|
||||
return;
|
||||
|
||||
// Then check the elements inside this element for the same.
|
||||
if (this.REGEXPS.videos.test(targetList[y].innerHTML))
|
||||
continue;
|
||||
if (this.REGEXPS.videos.test(element.innerHTML))
|
||||
return;
|
||||
}
|
||||
|
||||
targetList[y].parentNode.removeChild(targetList[y]);
|
||||
}
|
||||
element.parentNode.removeChild(element);
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
@ -1578,7 +1597,7 @@ Readability.prototype = {
|
||||
if (!metadata.excerpt) {
|
||||
var paragraphs = articleContent.getElementsByTagName("p");
|
||||
if (paragraphs.length > 0) {
|
||||
metadata.excerpt = paragraphs[0].textContent;
|
||||
metadata.excerpt = paragraphs[0].textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user