Skip to content

Commit

Permalink
Fix lazy-loaded images are not visible in Kinja sites (mozilla#590)
Browse files Browse the repository at this point in the history
* Add initial test case for kinja's lazy image

* Implement method to remove small data uri image

* Convert relative uri in poster and srcset of media nodes

* Eslint doesn't like arrow function

* Unescape HTML entities in metadata

* Fix wrong regex for parsing srcset urls

* Remove line to check data url since it already handled by new URL

* Replace String.matchAll since it only supported in Node 12+

* Use numeric code when unescaping HTML

* Don't remove data URL src if it's svg

* Don't remove b64 src if it's the only attr that contains image

* Make the comma part non-optional in regex for srcset url

* Fix wrong code for unescaping HTML

* Don't capture comma and semicolon in data URL regex
  • Loading branch information
RadhiFadlillah authored Apr 13, 2020
1 parent d5621f8 commit 52ab9b5
Show file tree
Hide file tree
Showing 16 changed files with 4,937 additions and 305 deletions.
140 changes: 117 additions & 23 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ Readability.prototype = {
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
hasContent: /\S$/,
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i
},

DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
Expand All @@ -155,6 +157,15 @@ Readability.prototype = {
// These are the classes that readability sets itself.
CLASSES_TO_PRESERVE: [ "page" ],

// These are the list of HTML entities that need to be escaped.
HTML_ESCAPE_MAP: {
"lt": "<",
"gt": ">",
"amp": "&",
"quot": '"',
"apos": "'",
},

/**
* Run any post-process modifications to article content as necessary.
*
Expand Down Expand Up @@ -328,6 +339,7 @@ Readability.prototype = {
if (baseURI == documentURI && uri.charAt(0) == "#") {
return uri;
}

// Otherwise, resolve against base URI:
try {
return new URL(uri, baseURI).href;
Expand Down Expand Up @@ -362,11 +374,29 @@ Readability.prototype = {
}
});

var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
this._forEachNode(imgs, function(img) {
var src = img.getAttribute("src");
var medias = this._getAllNodesWithTag(articleContent, [
"img", "picture", "figure", "video", "audio", "source"
]);

this._forEachNode(medias, function(media) {
var src = media.getAttribute("src");
var poster = media.getAttribute("poster");
var srcset = media.getAttribute("srcset");

if (src) {
img.setAttribute("src", toAbsoluteURI(src));
media.setAttribute("src", toAbsoluteURI(src));
}

if (poster) {
media.setAttribute("poster", toAbsoluteURI(poster));
}

if (srcset) {
var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
return toAbsoluteURI(p1) + (p2 || "") + p3;
});

media.setAttribute("srcset", newSrcset);
}
});
},
Expand Down Expand Up @@ -1239,6 +1269,26 @@ Readability.prototype = {
return false;
},

/**
* Converts some of the common HTML entities in string to their corresponding characters.
*
* @param str {string} - a string to unescape.
* @return string without HTML entity.
*/
_unescapeHtmlEntities: function(str) {
if (!str) {
return str;
}

var htmlEscapeMap = this.HTML_ESCAPE_MAP;
return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
return htmlEscapeMap[tag];
}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
var num = parseInt(hex || numStr, hex ? 16 : 10);
return String.fromCharCode(num);
});
},

/**
* Attempts to get excerpt and byline metadata for the article.
*
Expand Down Expand Up @@ -1319,6 +1369,13 @@ Readability.prototype = {
// get site name
metadata.siteName = values["og:site_name"];

// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
metadata.title = this._unescapeHtmlEntities(metadata.title);
metadata.byline = this._unescapeHtmlEntities(metadata.byline);
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);

return metadata;
},

Expand Down Expand Up @@ -1745,30 +1802,67 @@ Readability.prototype = {
/* convert images and figures that have properties like data-src into images that can be loaded without JS */
_fixLazyImages: function (root) {
this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) {
// In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
// So, here we check if the data uri is too short, just might as well remove it.
if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
if (parts[1] === "image/svg+xml") {
return;
}

// Make sure this element has other attributes which contains image.
// If it doesn't, then this src is important and shouldn't be removed.
var srcCouldBeRemoved = false;
for (var i = 0; i < elem.attributes.length; i++) {
var attr = elem.attributes[i];
if (attr.name === "src" || attr.name === "srcset") {
if (attr.name === "src") {
continue;
}
var copyTo = null;
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
copyTo = "srcset";
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
copyTo = "src";

if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
srcCouldBeRemoved = true;
break;
}
}

// Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
// it will be too small, therefore it might be placeholder image.
if (srcCouldBeRemoved) {
var b64starts = elem.src.search(/base64\s*/i) + 7;
var b64length = elem.src.length - b64starts;
if (b64length < 133) {
elem.removeAttribute("src");
}
if (copyTo) {
//if this is an img or picture, set the attribute directly
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
elem.setAttribute(copyTo, attr.value);
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
//see the nytimes-3 testcase for an example
var img = this._doc.createElement("img");
img.setAttribute(copyTo, attr.value);
elem.appendChild(img);
}
}
}

// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
return;
}

for (var j = 0; j < elem.attributes.length; j++) {
attr = elem.attributes[j];
if (attr.name === "src" || attr.name === "srcset") {
continue;
}
var copyTo = null;
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
copyTo = "srcset";
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
copyTo = "src";
}
if (copyTo) {
//if this is an img or picture, set the attribute directly
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
elem.setAttribute(copyTo, attr.value);
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
//see the nytimes-3 testcase for an example
var img = this._doc.createElement("img");
img.setAttribute(copyTo, attr.value);
elem.appendChild(img);
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions test/test-pages/data-url-image/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"title": "Document",
"byline": null,
"dir": null,
"excerpt": "Lorem ipsum dolor sit amet consectetur adipisicing elit. Natus eaque totam provident obcaecati nisi praesentium iusto velit fuga debitis quidem ut repellat corrupti, eligendi inventore quibusdam perspiciatis delectus omnis pariatur excepturi quasi fugit? A adipisci natus nostrum, qui aperiam, at culpa corrupti autem enim earum vitae. Nostrum et officiis facere ex recusandae tenetur, delectus odit provident soluta id perferendis ducimus quibusdam corporis rerum voluptatem architecto sequi beatae quod mollitia voluptatibus earum tempora inventore ut. Deserunt reprehenderit recusandae nostrum, eaque fuga cum, repellat, perspiciatis ducimus in non consequatur ratione. Sint rerum necessitatibus deleniti odio earum voluptatum eos modi ab dolor minus.",
"siteName": null,
"readerable": true
}
11 changes: 11 additions & 0 deletions test/test-pages/data-url-image/expected.html

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions test/test-pages/data-url-image/source.html

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions test/test-pages/lazy-image-2/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"title": "The Spectacular Story Of Metroid, One Of Gaming's Richest Universes",
"byline": "Mama Robotnik",
"dir": null,
"excerpt": "Nothing beats the passion of a true fan writing about something they love. That's what you're about to see here: one of the richest, most amazing tributes to a great gaming series that we've ever run on Kotaku. Warning #1: this one might make your browser chug, so close your other tabs. Warning #2: This piece might make it hurt a little more than there are no new Metroid games from Nintendo on the horizon.",
"siteName": "Kotaku",
"readerable": true
}
Loading

0 comments on commit 52ab9b5

Please sign in to comment.