Skip to content

Commit

Permalink
Include more ancestors in candidate scoring (mozilla#611)
Browse files Browse the repository at this point in the history
* include more ancestors in candidate scoring

* fix medium-3 testcase

The original source file contained two copies of the document, which
was causing incorrect results

* remove unnecessary nested elements

* fix removal of empty elements

* add option to regenerate all testcases

* update tests

* fix quanta testcase

* fix creating testcase from network

* fix early exit in testcase generation

* format HTML before comparing while testing

* upgrade js-beautify

* don't merge outer readability div
  • Loading branch information
PalmerAL authored Aug 21, 2020
1 parent 80d818a commit 3844d8f
Show file tree
Hide file tree
Showing 178 changed files with 10,339 additions and 12,354 deletions.
27 changes: 26 additions & 1 deletion Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ Readability.prototype = {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);

this._simplifyNestedElements(articleContent);

if (!this._keepClasses) {
// Remove classes.
this._cleanClasses(articleContent);
Expand Down Expand Up @@ -422,6 +424,29 @@ Readability.prototype = {
});
},

_simplifyNestedElements: function(articleContent) {
var node = articleContent;

while (node) {
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
if (this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
} else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
var child = node.children[0];
for (var i = 0; i < node.attributes.length; i++) {
child.setAttribute(node.attributes[i].name, node.attributes[i].value);
}
node.parentNode.replaceChild(child, node);
node = child;
continue;
}
}

node = this._getNextNode(node);
}
},

/**
* Get the article title as an H1.
*
Expand Down Expand Up @@ -970,7 +995,7 @@ Readability.prototype = {
return;

// Exclude nodes with no ancestor.
var ancestors = this._getNodeAncestors(elementToScore, 3);
var ancestors = this._getNodeAncestors(elementToScore, 5);
if (ancestors.length === 0)
return;

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"chai": "^2.1.*",
"eslint": ">=4.2",
"htmltidy2": "^0.3.0",
"js-beautify": "^1.5.5",
"js-beautify": "^1.13.0",
"jsdom": "^13.1",
"matcha": "^0.6.0",
"mocha": "^2.2.*",
Expand Down
83 changes: 51 additions & 32 deletions test/generate-testcase.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,41 @@ var htmltidy = require("htmltidy2").tidy;
var { Readability, isProbablyReaderable } = require("../index");
var JSDOMParser = require("../JSDOMParser");

var FFX_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0";
var FFX_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:80.0) Gecko/20100101 Firefox/80.0";

if (process.argv.length < 3) {
console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
process.exit(0);
throw "Abort";
}
var testcaseRoot = path.join(__dirname, "test-pages");

var slug = process.argv[2];
var argURL = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue.

var destRoot = path.join(__dirname, "test-pages", slug);

fs.mkdir(destRoot, function(err) {
if (err) {
var sourceFile = path.join(destRoot, "source.html");
fs.exists(sourceFile, function(exists) {
if (exists) {
fs.readFile(sourceFile, {encoding: "utf-8"}, function(readFileErr, data) {
if (readFileErr) {
console.error("Source existed but couldn't be read?");
process.exit(1);
return;
}
onResponseReceived(null, data);
});
} else {
fetchSource(argURL, onResponseReceived);
}
function generateTestcase(slug) {
var destRoot = path.join(testcaseRoot, slug);

fs.mkdir(destRoot, function(err) {
if (err) {
var sourceFile = path.join(destRoot, "source.html");
fs.exists(sourceFile, function(exists) {
if (exists) {
fs.readFile(sourceFile, {encoding: "utf-8"}, function(readFileErr, data) {
if (readFileErr) {
console.error("Source existed but couldn't be read?");
process.exit(1);
return;
}
onResponseReceived(null, data, destRoot);
});
} else {
fetchSource(argURL, function(fetchErr, data) {
onResponseReceived(fetchErr, data, destRoot);
});
}
});
return;
}
fetchSource(argURL, function(fetchErr, data) {
onResponseReceived(fetchErr, data, destRoot);
});
return;
}
fetchSource(argURL, onResponseReceived);
});
});
}

function fetchSource(url, callbackFn) {
if (!url) {
Expand Down Expand Up @@ -88,7 +89,7 @@ function sanitizeSource(html, callbackFn) {
}, callbackFn);
}

function onResponseReceived(error, source) {
function onResponseReceived(error, source, destRoot) {
if (error) {
console.error("Couldn't tidy source html!");
console.error(error);
Expand Down Expand Up @@ -159,9 +160,27 @@ function runReadability(source, destPath, metadataDestPath) {
console.error("Couldn't write data to expected-metadata.json!");
console.error(metadataWriteErr);
}

process.exit(0);
});
});
}

if (process.argv.length < 3) {
console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
process.exit(0);
throw "Abort";
}

if (process.argv[2] === "all") {
fs.readdir(testcaseRoot, function(err, files) {
if (err) {
console.error("error reading testcaseses");
return;
}

files.forEach(function(file) {
generateTestcase(file);
});
});
} else {
generateTestcase(process.argv[2]);
}
5 changes: 3 additions & 2 deletions test/test-pages/001/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"title": "Get your Frontend JavaScript Code Covered | Code",
"byline": "Nicolas Perriault —",
"dir": null,
"excerpt": "Nicolas Perriault's homepage.",
"readerable": true,
"siteName": null
"siteName": null,
"readerable": true
}
41 changes: 22 additions & 19 deletions test/test-pages/001/expected.html
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
<div id="readability-page-1" class="page">
<section>
<p><strong>So finally you're <a href="http://fakehost/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">testing your frontend JavaScript code</a>? Great! The more you
write tests, the more confident you are with your code… but how much precisely?
That's where <a href="http://en.wikipedia.org/wiki/Code_coverage">code coverage</a> might
help.</strong> </p>
<p><strong>So finally you're <a href="http://fakehost/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">testing your frontend JavaScript code</a>? Great! The more you write tests, the more confident you are with your code… but how much precisely? That's where <a href="http://en.wikipedia.org/wiki/Code_coverage">code coverage</a> might help.</strong>
</p>
<p>The idea behind code coverage is to record which parts of your code (functions, statements, conditionals and so on) have been executed by your test suite, to compute metrics out of these data and usually to provide tools for navigating and inspecting them.</p>
<p>Not a lot of frontend developers I know actually test their frontend code, and I can barely imagine how many of them have ever setup code coverage… Mostly because there are not many frontend-oriented tools in this area I guess.</p>
<p>Actually I've only found one which provides an adapter for <a href="http://visionmedia.github.io/mocha/">Mocha</a> and actually works…</p>
<blockquote>
<p>Drinking game for web devs:
<br/>(1) Think of a noun
<br/>(2) Google "&lt;noun&gt;.js"
<br/>(3) If a library with that name exists - drink</p>— Shay Friedman (@ironshay) <a href="https://twitter.com/ironshay/statuses/370525864523743232">August 22, 2013</a> </blockquote>
<p><strong><a href="http://blanketjs.org/">Blanket.js</a></strong> is an <em>easy to install, easy to configure,
and easy to use JavaScript code coverage library that works both in-browser and
with nodejs.</em> </p>
<p>Its use is dead easy, adding Blanket support to your Mocha test suite is just matter of adding this simple line to your HTML test file:</p> <pre><code>&lt;script src="vendor/blanket.js"
<p>Drinking game for web devs: <br />(1) Think of a noun <br />(2) Google "&lt;noun&gt;.js" <br />(3) If a library with that name exists - drink</p>— Shay Friedman (@ironshay) <a href="https://twitter.com/ironshay/statuses/370525864523743232">August 22, 2013</a>
</blockquote>
<p><strong><a href="http://blanketjs.org/">Blanket.js</a></strong> is an <em>easy to install, easy to configure, and easy to use JavaScript code coverage library that works both in-browser and with nodejs.</em>
</p>
<p>Its use is dead easy, adding Blanket support to your Mocha test suite is just matter of adding this simple line to your HTML test file:</p>
<pre><code>&lt;script src="vendor/blanket.js"
data-cover-adapter="vendor/mocha-blanket.js"&gt;&lt;/script&gt;
</code></pre>
<p>Source files: <a href="https://raw.github.com/alex-seville/blanket/master/dist/qunit/blanket.min.js">blanket.js</a>, <a href="https://raw.github.com/alex-seville/blanket/master/src/adapters/mocha-blanket.js">mocha-blanket.js</a> </p>
<p>As an example, let's reuse the silly <code>Cow</code> example we used <a href="http://fakehost/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">in a previous episode</a>:</p> <pre><code>// cow.js
<p>Source files: <a href="https://raw.github.com/alex-seville/blanket/master/dist/qunit/blanket.min.js">blanket.js</a>, <a href="https://raw.github.com/alex-seville/blanket/master/src/adapters/mocha-blanket.js">mocha-blanket.js</a>
</p>
<p>As an example, let's reuse the silly <code>Cow</code> example we used <a href="http://fakehost/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">in a previous episode</a>:</p>
<pre><code>// cow.js
(function(exports) {
"use strict";

Expand All @@ -37,7 +35,8 @@
};
})(this);
</code></pre>
<p>And its test suite, powered by Mocha and <a href="http://chaijs.com/">Chai</a>:</p> <pre><code>var expect = chai.expect;
<p>And its test suite, powered by Mocha and <a href="http://chaijs.com/">Chai</a>:</p>
<pre><code>var expect = chai.expect;

describe("Cow", function() {
describe("constructor", function() {
Expand All @@ -60,7 +59,8 @@
});
});
</code></pre>
<p>Let's create the HTML test file for it, featuring Blanket and its adapter for Mocha:</p> <pre><code>&lt;!DOCTYPE html&gt;
<p>Let's create the HTML test file for it, featuring Blanket and its adapter for Mocha:</p>
<pre><code>&lt;!DOCTYPE html&gt;
&lt;html&gt;
&lt;head&gt;
&lt;meta charset="utf-8"&gt;
Expand Down Expand Up @@ -88,9 +88,12 @@
<li>The HTML test file <em>must</em> be served over HTTP for the adapter to be loaded.</li>
</ul>
<p>Running the tests now gives us something like this:</p>
<p> <img alt="screenshot" src="http://fakehost/static/code/2013/blanket-coverage.png"/> </p>
<p>
<img alt="screenshot" src="http://fakehost/static/code/2013/blanket-coverage.png" />
</p>
<p>As you can see, the report at the bottom highlights that we haven't actually tested the case where an error is raised in case a target name is missing. We've been informed of that, nothing more, nothing less. We simply know we're missing a test here. Isn't this cool? I think so!</p>
<p>Just remember that code coverage will only <a href="http://codebetter.com/karlseguin/2008/12/09/code-coverage-use-it-wisely/">bring you numbers</a> and raw information, not actual proofs that the whole of your <em>code logic</em> has been actually covered. If you ask me, the best inputs you can get about your code logic and implementation ever are the ones issued out of <a href="http://www.extremeprogramming.org/rules/pair.html">pair programming</a> sessions and <a href="http://alexgaynor.net/2013/sep/26/effective-code-review/">code reviews</a> — but that's another story.</p>
<p><strong>So is code coverage silver bullet? No. Is it useful? Definitely. Happy testing!</strong> </p>
<p><strong>So is code coverage silver bullet? No. Is it useful? Definitely. Happy testing!</strong>
</p>
</section>
</div>
</div>
5 changes: 3 additions & 2 deletions test/test-pages/002/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"title": "This API is so Fetching!",
"byline": "Nikhil Marathe",
"dir": null,
"excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ...",
"readerable": true,
"siteName": "Mozilla Hacks – the Web developer blog"
"siteName": "Mozilla Hacks – the Web developer blog",
"readerable": true
}
Loading

0 comments on commit 3844d8f

Please sign in to comment.