Fix lazy-loaded images are not visible in Kinja sites (mozilla#590)

* Add initial test case for kinja's lazy image * Implement method to remove small data uri image * Convert relative uri in poster and srcset of media nodes * Eslint doesn't like arrow function * Unescape HTML entities in metadata * Fix wrong regex for parsing srcset urls * Remove line to check data url since it already handled by new URL * Replace String.matchAll since it only supported in Node 12+ * Use numeric code when unescaping HTML * Don't remove data URL src if it's svg * Don't remove b64 src if it's the only attr that contains image * Make the comma part non-optional in regex for srcset url * Fix wrong code for unescaping HTML * Don't capture comma and semicolon in data URL regex
royalgarter · Apr 13, 2020 · 52ab9b5 · 52ab9b5
1 parent d5621f8
commit 52ab9b5
Show file tree

Hide file tree

Showing 16 changed files with 4,937 additions and 305 deletions.
diff --git a/Readability.js b/Readability.js
@@ -131,6 +131,8 @@ Readability.prototype = {
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
+    srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
+    b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
@@ -155,6 +157,15 @@ Readability.prototype = {
   // These are the classes that readability sets itself.
   CLASSES_TO_PRESERVE: [ "page" ],
 
+  // These are the list of HTML entities that need to be escaped.
+  HTML_ESCAPE_MAP: {
+    "lt": "<",
+    "gt": ">",
+    "amp": "&",
+    "quot": '"',
+    "apos": "'",
+  },
+
   /**
    * Run any post-process modifications to article content as necessary.
    *
@@ -328,6 +339,7 @@ Readability.prototype = {
       if (baseURI == documentURI && uri.charAt(0) == "#") {
         return uri;
       }
+
       // Otherwise, resolve against base URI:
       try {
         return new URL(uri, baseURI).href;
@@ -362,11 +374,29 @@ Readability.prototype = {
       }
     });
 
-    var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
-    this._forEachNode(imgs, function(img) {
-      var src = img.getAttribute("src");
+    var medias = this._getAllNodesWithTag(articleContent, [
+      "img", "picture", "figure", "video", "audio", "source"
+    ]);
+
+    this._forEachNode(medias, function(media) {
+      var src = media.getAttribute("src");
+      var poster = media.getAttribute("poster");
+      var srcset = media.getAttribute("srcset");
+
       if (src) {
-        img.setAttribute("src", toAbsoluteURI(src));
+        media.setAttribute("src", toAbsoluteURI(src));
+      }
+
+      if (poster) {
+        media.setAttribute("poster", toAbsoluteURI(poster));
+      }
+
+      if (srcset) {
+        var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
+          return toAbsoluteURI(p1) + (p2 || "") + p3;
+        });
+
+        media.setAttribute("srcset", newSrcset);
       }
     });
   },
@@ -1239,6 +1269,26 @@ Readability.prototype = {
     return false;
   },
 
+  /**
+   * Converts some of the common HTML entities in string to their corresponding characters.
+   *
+   * @param str {string} - a string to unescape.
+   * @return string without HTML entity.
+   */
+  _unescapeHtmlEntities: function(str) {
+    if (!str) {
+      return str;
+    }
+
+    var htmlEscapeMap = this.HTML_ESCAPE_MAP;
+    return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
+      return htmlEscapeMap[tag];
+    }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
+      var num = parseInt(hex || numStr, hex ? 16 : 10);
+      return String.fromCharCode(num);
+    });
+  },
+
   /**
    * Attempts to get excerpt and byline metadata for the article.
    *
@@ -1319,6 +1369,13 @@ Readability.prototype = {
     // get site name
     metadata.siteName = values["og:site_name"];
 
+    // in many sites the meta value is escaped with HTML entities,
+    // so here we need to unescape it
+    metadata.title = this._unescapeHtmlEntities(metadata.title);
+    metadata.byline = this._unescapeHtmlEntities(metadata.byline);
+    metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
+    metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
+
     return metadata;
   },
 
@@ -1745,30 +1802,67 @@ Readability.prototype = {
   /* convert images and figures that have properties like data-src into images that can be loaded without JS */
   _fixLazyImages: function (root) {
     this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
-      // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
-      if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) {
+      // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
+      // So, here we check if the data uri is too short, just might as well remove it.
+      if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
+        // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
+        var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
+        if (parts[1] === "image/svg+xml") {
+          return;
+        }
+
+        // Make sure this element has other attributes which contains image.
+        // If it doesn't, then this src is important and shouldn't be removed.
+        var srcCouldBeRemoved = false;
         for (var i = 0; i < elem.attributes.length; i++) {
           var attr = elem.attributes[i];
-          if (attr.name === "src" || attr.name === "srcset") {
+          if (attr.name === "src") {
             continue;
           }
-          var copyTo = null;
-          if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
-            copyTo = "srcset";
-          } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
-            copyTo = "src";
+
+          if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
+            srcCouldBeRemoved = true;
+            break;
+          }
+        }
+
+        // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
+        // it will be too small, therefore it might be placeholder image.
+        if (srcCouldBeRemoved) {
+          var b64starts = elem.src.search(/base64\s*/i) + 7;
+          var b64length = elem.src.length - b64starts;
+          if (b64length < 133) {
+            elem.removeAttribute("src");
           }
-          if (copyTo) {
-            //if this is an img or picture, set the attribute directly
-            if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
-              elem.setAttribute(copyTo, attr.value);
-            } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
-              //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
-              //see the nytimes-3 testcase for an example
-              var img = this._doc.createElement("img");
-              img.setAttribute(copyTo, attr.value);
-              elem.appendChild(img);
-            }
+        }
+      }
+
+      // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
+      if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
+        return;
+      }
+
+      for (var j = 0; j < elem.attributes.length; j++) {
+        attr = elem.attributes[j];
+        if (attr.name === "src" || attr.name === "srcset") {
+          continue;
+        }
+        var copyTo = null;
+        if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
+          copyTo = "srcset";
+        } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
+          copyTo = "src";
+        }
+        if (copyTo) {
+          //if this is an img or picture, set the attribute directly
+          if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
+            elem.setAttribute(copyTo, attr.value);
+          } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
+            //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
+            //see the nytimes-3 testcase for an example
+            var img = this._doc.createElement("img");
+            img.setAttribute(copyTo, attr.value);
+            elem.appendChild(img);
           }
         }
       }

diff --git a/test/test-pages/data-url-image/expected-metadata.json b/test/test-pages/data-url-image/expected-metadata.json
@@ -0,0 +1,8 @@
+{
+  "title": "Document",
+  "byline": null,
+  "dir": null,
+  "excerpt": "Lorem ipsum dolor sit amet consectetur adipisicing elit. Natus eaque totam provident obcaecati nisi praesentium iusto velit fuga debitis quidem ut repellat corrupti, eligendi inventore quibusdam perspiciatis delectus omnis pariatur excepturi quasi fugit? A adipisci natus nostrum, qui aperiam, at culpa corrupti autem enim earum vitae. Nostrum et officiis facere ex recusandae tenetur, delectus odit provident soluta id perferendis ducimus quibusdam corporis rerum voluptatem architecto sequi beatae quod mollitia voluptatibus earum tempora inventore ut. Deserunt reprehenderit recusandae nostrum, eaque fuga cum, repellat, perspiciatis ducimus in non consequatur ratione. Sint rerum necessitatibus deleniti odio earum voluptatum eos modi ab dolor minus.",
+  "siteName": null,
+  "readerable": true
+}
diff --git a/test/test-pages/data-url-image/expected.html b/test/test-pages/data-url-image/expected.html
diff --git a/test/test-pages/data-url-image/source.html b/test/test-pages/data-url-image/source.html
diff --git a/test/test-pages/lazy-image-2/expected-metadata.json b/test/test-pages/lazy-image-2/expected-metadata.json
@@ -0,0 +1,8 @@
+{
+  "title": "The Spectacular Story Of Metroid, One Of Gaming's Richest Universes",
+  "byline": "Mama Robotnik",
+  "dir": null,
+  "excerpt": "Nothing beats the passion of a true fan writing about something they love. That's what you're about to see here: one of the richest, most amazing tributes to a great gaming series that we've ever run on Kotaku. Warning #1: this one might make your browser chug, so close your other tabs. Warning #2: This piece might make it hurt a little more than there are no new Metroid games from Nintendo on the horizon.",
+  "siteName": "Kotaku",
+  "readerable": true
+}