No bug: update readability libs to the up-to-date github versions. rs=me+Gijs

--HG-- extra : rebase_source : a3fe5635643ee253f82764f8e54409982978703e
2025-02-05 05:30:29 +00:00 · 2015-03-23 11:29:23 -07:00 · 2015-03-23 11:29:23 -07:00 · faf2258dd6
commit faf2258dd6
parent 7b11753f7f
1 changed files with 116 additions and 97 deletions
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@ -103,7 +103,7 @@ Readability.prototype = {
    byline: /byline|author|dateline|writtenby/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    normalize: /\s{2,}/g,
-    videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
+    videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    whitespace: /^\s*$/,
@ -125,6 +125,36 @@ Readability.prototype = {
    this._fixRelativeUris(articleContent);
  },

+  /**
+   * Iterate over a NodeList, which doesn't natively fully implement the Array
+   * interface.
+   *
+   * For convenience, the current object context is applied to the provided
+   * iterate function.
+   *
+   * @param  NodeList nodeList The NodeList.
+   * @param  Function fn       The iterate function.
+   * @return void
+   */
+  _forEachNode: function(nodeList, fn) {
+    return Array.prototype.forEach.call(nodeList, fn, this);
+  },
+
+  /**
+   * Iterate over a NodeList, return true if any of the provided iterate
+   * function calls returns true, false otherwise.
+   *
+   * For convenience, the current object context is applied to the
+   * provided iterate function.
+   *
+   * @param  NodeList nodeList The NodeList.
+   * @param  Function fn       The iterate function.
+   * @return Boolean
+   */
+  _someNode: function(nodeList, fn) {
+    return Array.prototype.some.call(nodeList, fn, this);
+  },
+
  /**
   * Converts each <a> and <img> uri in the given element to an absolute URI.
   *
@ -149,6 +179,10 @@ Readability.prototype = {
      if (uri[0] == "/")
        return prePath + uri;

+      // Dotslash relative URI.
+      if (uri.indexOf("./") === 0)
+        return pathBase + uri.slice(2);
+
      // Standard relative URI; add entire path. pathBase already includes a
      // trailing "/".
      return pathBase + uri;
@ -156,19 +190,18 @@ Readability.prototype = {

    function convertRelativeURIs(tagName, propName) {
      var elems = articleContent.getElementsByTagName(tagName);
-      for (var i = elems.length; --i >= 0;) {
-        var elem = elems[i];
+      this._forEachNode(elems, function(elem) {
        var relativeURI = elem.getAttribute(propName);
        if (relativeURI != null)
-          elems[i].setAttribute(propName, toAbsoluteURI(relativeURI));
-      }
+          elem.setAttribute(propName, toAbsoluteURI(relativeURI));
+      });
    }

     // Fix links.
-    convertRelativeURIs("a", "href");
+    convertRelativeURIs.call(this, "a", "href");

     // Fix images.
-    convertRelativeURIs("img", "src");
+    convertRelativeURIs.call(this, "img", "src");
  },

  /**
@ -224,19 +257,17 @@ Readability.prototype = {
    var doc = this._doc;

    // Remove all style tags in head
-    var styleTags = doc.getElementsByTagName("style");
-    for (var st = styleTags.length - 1; st >= 0; st -= 1) {
-      styleTags[st].parentNode.removeChild(styleTags[st]);
-    }
+    this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
+      styleNode.parentNode.removeChild(styleNode);
+    });

    if (doc.body) {
      this._replaceBrs(doc.body);
    }

-    var fonts = doc.getElementsByTagName("FONT");
-    for (var i = fonts.length; --i >=0;) {
-      this._setNodeTag(fonts[i], "SPAN");
-    }
+    this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
+      this._setNodeTag(fontNode, "SPAN");
+    });
  },

  /**
@ -262,9 +293,7 @@ Readability.prototype = {
   *   <div>foo<br>bar<p>abc</p></div>
   */
  _replaceBrs: function (elem) {
-    var brs = elem.getElementsByTagName("br");
-    for (var i = 0; i < brs.length; i++) {
-      var br = brs[i];
+    this._forEachNode(elem.getElementsByTagName("br"), function(br) {
      var next = br.nextSibling;

      // Whether 2 or more <br> elements have been found and replaced with a
@ -303,7 +332,7 @@ Readability.prototype = {
          next = sibling;
        }
      }
-    }
+    });
  },

  _setNodeTag: function (node, tag) {
@ -326,6 +355,7 @@ Readability.prototype = {
    // Clean out junk from the article content
    this._cleanConditionally(articleContent, "form");
    this._clean(articleContent, "object");
+    this._clean(articleContent, "embed");
    this._clean(articleContent, "h1");

    // If there is only one h2, they are probably using it as a header
@ -343,26 +373,23 @@ Readability.prototype = {
    this._cleanConditionally(articleContent, "div");

    // Remove extra paragraphs
-    var articleParagraphs = articleContent.getElementsByTagName('p');
-    for (var i = articleParagraphs.length - 1; i >= 0; i -= 1) {
-      var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
-      var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
-      var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
+    this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
+      var imgCount = paragraph.getElementsByTagName('img').length;
+      var embedCount = paragraph.getElementsByTagName('embed').length;
+      var objectCount = paragraph.getElementsByTagName('object').length;
+      // At this point, nasty iframes have been removed, only remain embedded video ones.
+      var iframeCount = paragraph.getElementsByTagName('iframe').length;
+      var totalCount = imgCount + embedCount + objectCount + iframeCount;

-      if (imgCount === 0 &&
-        embedCount === 0 &&
-        objectCount === 0 &&
-        this._getInnerText(articleParagraphs[i], false) === '')
-        articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
-    }
+      if (totalCount === 0 && !this._getInnerText(paragraph, false))
+        paragraph.parentNode.removeChild(paragraph);
+    });

-    var brs = articleContent.getElementsByTagName("BR");
-    for (var i = brs.length; --i >= 0;) {
-      var br = brs[i];
+    this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
      var next = this._nextElement(br.nextSibling);
      if (next && next.tagName == "P")
        br.parentNode.removeChild(br);
-    }
+    });
  },

  /**
@ -529,8 +556,7 @@ Readability.prototype = {
            elementsToScore.push(node);
          } else {
            // EXPERIMENTAL
-            for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
-              var childNode = node.childNodes[i];
+            this._forEachNode(node.childNodes, function(childNode) {
              if (childNode.nodeType === Node.TEXT_NODE) {
                var p = doc.createElement('p');
                p.textContent = childNode.textContent;
@ -538,7 +564,7 @@ Readability.prototype = {
                p.className = 'readability-styled';
                node.replaceChild(p, childNode);
              }
-            }
+            });
          }
        }
        node = this._getNextNode(node);
@ -551,17 +577,17 @@ Readability.prototype = {
       * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
      **/
      var candidates = [];
-      for (var pt = 0; pt < elementsToScore.length; pt += 1) {
-        var parentNode = elementsToScore[pt].parentNode;
+      this._forEachNode(elementsToScore, function(elementToScore) {
+        var parentNode = elementToScore.parentNode;
        var grandParentNode = parentNode ? parentNode.parentNode : null;
-        var innerText = this._getInnerText(elementsToScore[pt]);
+        var innerText = this._getInnerText(elementToScore);

        if (!parentNode || typeof(parentNode.tagName) === 'undefined')
-          continue;
+          return;

        // If this paragraph is less than 25 characters, don't even count it.
        if (innerText.length < 25)
-          continue;
+          return;

        // Initialize readability data for the parent.
        if (typeof parentNode.readability === 'undefined') {
@ -593,7 +619,7 @@ Readability.prototype = {

        if (grandParentNode)
          grandParentNode.readability.contentScore += contentScore / 2;
-      }
+      });

      // After we've calculated scores, loop through all of the possible
      // candidate nodes we found and find the one with the highest score.
@ -650,9 +676,9 @@ Readability.prototype = {
        // below does some of that - but only if we've looked high enough up the DOM
        // tree.
        var parentOfTopCandidate = topCandidate.parentNode;
+        var lastScore = topCandidate.readability.contentScore;
        // The scores shouldn't get too low.
-        var scoreThreshold = topCandidate.readability.contentScore / 3;
-        var lastScore = parentOfTopCandidate.readability.contentScore;
+        var scoreThreshold = lastScore / 3;
        while (parentOfTopCandidate && parentOfTopCandidate.readability) {
          var parentScore = parentOfTopCandidate.readability.contentScore;
          if (parentScore < scoreThreshold)
@ -662,6 +688,7 @@ Readability.prototype = {
            topCandidate = parentOfTopCandidate;
            break;
          }
+          lastScore = parentOfTopCandidate.readability.contentScore;
          parentOfTopCandidate = parentOfTopCandidate.parentNode;
        }
      }
@ -804,7 +831,7 @@ Readability.prototype = {

  /**
   * Attempts to get excerpt and byline metadata for the article.
-   * 
+   *
   * @return Object with optional "excerpt" and "byline" properties
   */
  _getArticleMetadata: function() {
@ -820,14 +847,13 @@ Readability.prototype = {
    var propertyPattern = /^\s*og\s*:\s*description\s*$/gi;

    // Find description tags.
-    for (var i = 0; i < metaElements.length; i++) {
-      var element = metaElements[i];
+    this._forEachNode(metaElements, function(element) {
      var elementName = element.getAttribute("name");
      var elementProperty = element.getAttribute("property");

      if (elementName === "author") {
        metadata.byline = element.getAttribute("content");
-        continue;
+        return;
      }

      var name = null;
@ -846,7 +872,7 @@ Readability.prototype = {
          values[name] = content.trim();
        }
      }
-    }
+    });

    if ("description" in values) {
      metadata.excerpt = values["description"];
@ -867,14 +893,13 @@ Readability.prototype = {
   * @param Element
  **/
  _removeScripts: function(doc) {
-    var scripts = doc.getElementsByTagName('script');
-    for (var i = scripts.length - 1; i >= 0; i -= 1) {
-      scripts[i].nodeValue="";
-      scripts[i].removeAttribute('src');
+    this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
+      scriptNode.nodeValue = "";
+      scriptNode.removeAttribute('src');

-      if (scripts[i].parentNode)
-          scripts[i].parentNode.removeChild(scripts[i]);
-    }
+      if (scriptNode.parentNode)
+        scriptNode.parentNode.removeChild(scriptNode);
+    });
  },

  /**
@ -884,22 +909,17 @@ Readability.prototype = {
   *
   * @param Element
  **/
-  _hasSinglePInsideElement: function(e) {
+  _hasSinglePInsideElement: function(element) {
    // There should be exactly 1 element child which is a P:
-    if (e.children.length != 1 || e.firstElementChild.tagName !== "P") {
+    if (element.children.length != 1 || element.firstElementChild.tagName !== "P") {
      return false;
    }
-    // And there should be no text nodes with real content
-    var childNodes = e.childNodes;
-    for (var i = childNodes.length; --i >= 0;) {
-      var node = childNodes[i];
-      if (node.nodeType == Node.TEXT_NODE &&
-          this.REGEXPS.hasContent.test(node.textContent)) {
-        return false;
-      }
-    }

-    return true;
+    // And there should be no text nodes with real content
+    return !this._someNode(element.childNodes, function(node) {
+      return node.nodeType === Node.TEXT_NODE &&
+             this.REGEXPS.hasContent.test(node.textContent);
+    });
  },

  /**
@ -907,14 +927,11 @@ Readability.prototype = {
   *
   * @param Element
   */
-  _hasChildBlockElement: function (e) {
-    var length = e.children.length;
-    for (var i = 0; i < length; i++) {
-      var child = e.children[i];
-      if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
-        return true;
-    }
-    return false;
+  _hasChildBlockElement: function (element) {
+    return this._someNode(element.childNodes, function(node) {
+      return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
+             this._hasChildBlockElement(node);
+    });
  },

  /**
@ -922,11 +939,12 @@ Readability.prototype = {
   * This also strips out any excess whitespace to be found.
   *
   * @param Element
+   * @param Boolean normalizeSpaces (default: true)
   * @return string
  **/
  _getInnerText: function(e, normalizeSpaces) {
-    var textContent = e.textContent.trim();
    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
+    var textContent = e.textContent.trim();

    if (normalizeSpaces) {
      return textContent.replace(this.REGEXPS.normalize, " ");
@ -985,14 +1003,17 @@ Readability.prototype = {
   * @param Element
   * @return number (float)
  **/
-  _getLinkDensity: function(e) {
-    var links = e.getElementsByTagName("a");
-    var textLength = this._getInnerText(e).length;
+  _getLinkDensity: function(element) {
+    var textLength = this._getInnerText(element).length;
+    if (textLength === 0)
+      return;
+
    var linkLength = 0;

-    for (var i = 0, il = links.length; i < il; i += 1) {
-      linkLength += this._getInnerText(links[i]).length;
-    }
+    // XXX implement _reduceNodeList?
+    this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
+      linkLength += this._getInnerText(linkNode).length;
+    });

    return linkLength / textLength;
  },
@ -1405,28 +1426,26 @@ Readability.prototype = {
   * @return void
   **/
  _clean: function(e, tag) {
-    var targetList = e.getElementsByTagName(tag);
-    var isEmbed = (tag === 'object' || tag === 'embed');
+    var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;

-    for (var y = targetList.length - 1; y >= 0; y -= 1) {
+    this._forEachNode(e.getElementsByTagName(tag), function(element) {
      // Allow youtube and vimeo videos through as people usually want to see those.
      if (isEmbed) {
-        var attributeValues = "";
-        for (var i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
-          attributeValues += targetList[y].attributes[i].value + '|';
-        }
+        var attributeValues = [].map.call(element.attributes, function(attr) {
+          return attr.value;
+        }).join("|");

        // First, check the elements attributes to see if any of them contain youtube or vimeo
        if (this.REGEXPS.videos.test(attributeValues))
-          continue;
+          return;

        // Then check the elements inside this element for the same.
-        if (this.REGEXPS.videos.test(targetList[y].innerHTML))
-          continue;
+        if (this.REGEXPS.videos.test(element.innerHTML))
+          return;
      }

-      targetList[y].parentNode.removeChild(targetList[y]);
-    }
+      element.parentNode.removeChild(element);
+    });
  },

  /**
@ -1578,7 +1597,7 @@ Readability.prototype = {
    if (!metadata.excerpt) {
      var paragraphs = articleContent.getElementsByTagName("p");
      if (paragraphs.length > 0) {
-        metadata.excerpt = paragraphs[0].textContent;
+        metadata.excerpt = paragraphs[0].textContent.trim();
      }
    }