Skip to content

Commit

Permalink
Put phrasing content into paragraphs
Browse files Browse the repository at this point in the history
This removes the need for `p.readability-styled` elements.
  • Loading branch information
davidar authored and gijsk committed May 15, 2018
1 parent c823a6e commit 9f2c5cb
Show file tree
Hide file tree
Showing 37 changed files with 398 additions and 357 deletions.
85 changes: 64 additions & 21 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,19 @@ Readability.prototype = {

DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],

// The commented out elements qualify as phrasing content but tend to be
// removed by readability when put into paragraphs, so we ignore them here.
PHRASING_ELEMS: [
// "CANVAS", "IFRAME", "SVG", "VIDEO",
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
"DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
"MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
"RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
"SUP", "TEXTAREA", "TIME", "VAR", "WBR"
],

// These are the classes that readability sets itself.
CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
CLASSES_TO_PRESERVE: [ "page" ],

/**
* Run any post-process modifications to article content as necessary.
Expand Down Expand Up @@ -217,6 +228,21 @@ Readability.prototype = {
return Array.prototype.some.call(nodeList, fn, this);
},

/**
* Iterate over a NodeList, return true if all of the provided iterate
* function calls return true, false otherwise.
*
* For convenience, the current object context is applied to the
* provided iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @return Boolean
*/
_everyNode: function(nodeList, fn) {
return Array.prototype.every.call(nodeList, fn, this);
},

/**
* Concat all nodelists passed as arguments.
*
Expand Down Expand Up @@ -787,6 +813,26 @@ Readability.prototype = {

// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
// Put phrasing content into paragraphs.
var p = null;
var childNode = node.firstChild;
while (childNode) {
var nextSibling = childNode.nextSibling;
if (this._isPhrasingContent(childNode)) {
if (p !== null) {
p.appendChild(childNode);
} else if (childNode.nodeType !== this.TEXT_NODE ||
childNode.textContent.trim().length > 0) {
p = doc.createElement('p');
node.replaceChild(p, childNode);
p.appendChild(childNode);
}
} else {
p = null;
}
childNode = nextSibling;
}

// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
Expand All @@ -799,17 +845,6 @@ Readability.prototype = {
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
if (childNode.nodeType === this.TEXT_NODE && childNode.textContent.trim().length > 0) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
p.className = 'readability-styled';
node.replaceChild(p, childNode);
}
});
}
}
node = this._getNextNode(node);
Expand Down Expand Up @@ -1274,6 +1309,16 @@ Readability.prototype = {
});
},

/***
* Determine if a node qualifies as phrasing content.
* https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
**/
_isPhrasingContent: function(node) {
return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
this._everyNode(node.childNodes, this._isPhrasingContent));
},

/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
Expand Down Expand Up @@ -1315,16 +1360,14 @@ Readability.prototype = {
if (!e || e.tagName.toLowerCase() === 'svg')
return;

if (e.className !== 'readability-styled') {
// Remove `style` and deprecated presentational attributes
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
}
// Remove `style` and deprecated presentational attributes
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
}

if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
e.removeAttribute('width');
e.removeAttribute('height');
}
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
e.removeAttribute('width');
e.removeAttribute('height');
}

var cur = e.firstElementChild;
Expand Down
13 changes: 8 additions & 5 deletions test/test-pages/ars-1/expected.html
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
<div id="readability-page-1" class="page">
<div itemprop="articleBody">
<figure> <img src="http://cdn.arstechnica.net/wp-content/uploads/2015/04/server-crash-640x426.jpg" width="640" height="331"/>
<figcaption class="caption"> </figcaption>
<figure> <img src="http://cdn.arstechnica.net/wp-content/uploads/2015/04/server-crash-640x426.jpg" width="640" height="331" />
<figcaption class="caption">
<p><a rel="nofollow" href="https://en.wikipedia.org/wiki/Kernel_panic#/media/File:Kernel-panic.jpg">Kevin</a></p>
</figcaption>
</figure>
<p>A flaw in the wildly popular online game <em>Minecraft</em> makes it easy for just about anyone to crash the server hosting the game, according to a computer programmer who has released proof-of-concept code that exploits the vulnerability.</p>
<p>"I thought a lot before writing this post," Pakistan-based developer Ammar Askar wrote in a <a href="http://blog.ammaraskar.com/minecraft-vulnerability-advisory">blog post published Thursday</a>, 21 months, he said, after privately reporting the bug to <em>Minecraft</em> developer Mojang. "On the one hand I don't want to expose thousands of servers to a major vulnerability, yet on the other hand Mojang has failed to act on it."</p>
<p>The bug resides in the <a href="https://github.com/ammaraskar/pyCraft">networking internals of the <em>Minecraft </em>protocol</a>. It allows the contents of inventory slots to be exchanged, so that, among other things, items in players' hotbars are displayed automatically after logging in. <em>Minecraft</em> items can also store arbitrary metadata in a file format known as <a href="http://wiki.vg/NBT">Named Binary Tag (NBT)</a>, which allows complex data structures to be kept in hierarchical nests. Askar has released <a href="https://github.com/ammaraskar/pyCraft/tree/nbt_exploit">proof-of-concept attack code</a> he said exploits the vulnerability to crash any server hosting the game. Here's how it works.</p>
<blockquote>
<p>The vulnerability stems from the fact that the client is allowed to send the server information about certain slots. This, coupled with the NBT format’s nesting allows us to <em>craft</em> a packet that is incredibly complex for the server to deserialize but trivial for us to generate.</p>
<p>In my case, I chose to create lists within lists, down to five levels. This is a json representation of what it looks like.</p>
<div><pre><code data-lang="javascript"><span>rekt</span><span>:</span> <span>{</span>
<div>
<pre><code data-lang="javascript"><span>rekt</span><span>:</span> <span>{</span>
<span>list</span><span>:</span> <span>[</span>
<span>list</span><span>:</span> <span>[</span>
<span>list</span><span>:</span> <span>[</span>
Expand All @@ -34,7 +37,7 @@
<span>...</span>
<span>]</span>
<span>...</span>
<span>}</span></code></pre></div>
<span>}</span></code></pre> </div>
<p>The root of the object, <code>rekt</code>, contains 300 lists. Each list has a list with 10 sublists, and each of those sublists has 10 of their own, up until 5 levels of recursion. That’s a total of <code>10^5 * 300 = 30,000,000</code> lists.</p>
<p>And this isn’t even the theoretical maximum for this attack. Just the nbt data for this payload is 26.6 megabytes. But luckily Minecraft implements a way to compress large packets, lucky us! zlib shrinks down our evil data to a mere 39 kilobytes.</p>
<p>Note: in previous versions of Minecraft, there was no protocol wide compression for big packets. Previously, NBT was sent compressed with gzip and prefixed with a signed short of its length, which reduced our maximum payload size to <code>2^15 - 1</code>. Now that the length is a varint capable of storing integers up to <code>2^28</code>, our potential for attack has increased significantly.</p>
Expand All @@ -45,4 +48,4 @@
</blockquote>
<p>Ars is asking Mojang for comment and will update this post if company officials respond.</p>
</div>
</div>
</div>
2 changes: 1 addition & 1 deletion test/test-pages/blogger/expected-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
"title": "Open Verilog flow for Silego GreenPak4 programmable logic devices",
"byline": null,
"dir": "ltr",
"excerpt": "I've written a couple of posts in the past few months but they were all for",
"excerpt": "I've written a couple of posts in the past few months but they were all for the blog at work so I figured I'm long overdue for one on Silicon Exposed.",
"readerable": true
}
3 changes: 1 addition & 2 deletions test/test-pages/blogger/expected.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
<div id="readability-page-1" class="page">
<div id="post-body-932306423056216142" itemprop="description articleBody">
<p style="display: inline;" class="readability-styled"> I've written a couple of posts in the past few months but they were all for </p><a href="http://blog.ioactive.com/search/label/Andrew%20Zonenberg">the blog at work</a>
<p style="display: inline;" class="readability-styled"> so I figured I'm long overdue for one on Silicon Exposed.</p>
<p> I've written a couple of posts in the past few months but they were all for <a href="http://blog.ioactive.com/search/label/Andrew%20Zonenberg">the blog at work</a> so I figured I'm long overdue for one on Silicon Exposed.</p>
<p>
<h2> So what's a GreenPak?</h2> <br/> Silego Technology is a fabless semiconductor company located in the SF Bay area, which makes (among other things) a line of programmable logic devices known as GreenPak. Their <a href="http://www.silego.com/products/greenpak5.html">5th generation parts</a> were just announced, but I started this project before that happened so I'm still targeting the <a href="http://www.silego.com/products/greenpak4.html">4th generation</a>.</p>
<p> GreenPak devices are kind of like itty bitty <a href="http://www.cypress.com/products/32-bit-arm-cortex-m-psoc">PSoCs</a> - they have a mixed signal fabric with an ADC, DACs, comparators, voltage references, plus a digital LUT/FF fabric and some typical digital MCU peripherals like counters and oscillators (but no CPU).</p>
Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/breitbart/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?'",
"byline": "by Lucas Nolan22 Dec 2016651",
"dir": null,
"dir": "ltr",
"excerpt": "Snopes fact checker and staff writer David Emery posted to Twitter asking if there were “any un-angry Trump supporters?”",
"readerable": true
}
3 changes: 2 additions & 1 deletion test/test-pages/breitbart/expected.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
<div id="readability-page-1" class="page">
<div>
<figure>
<div><img itemprop="image" src="http://media.breitbart.com/media/2016/11/GettyImages-621866810-640x480.jpg" alt="Supporters of Republican presidential nominee Donald Trump cheer during election night at the New York Hilton Midtown in New York on November 9, 2016. / AFP / JIM WATSON (Photo credit should read JIM WATSON/AFP/Getty Images)" width="640" height="480" />
<div>
<p><img itemprop="image" src="http://media.breitbart.com/media/2016/11/GettyImages-621866810-640x480.jpg" alt="Supporters of Republican presidential nominee Donald Trump cheer during election night at the New York Hilton Midtown in New York on November 9, 2016. / AFP / JIM WATSON (Photo credit should read JIM WATSON/AFP/Getty Images)" width="640" height="480" /></p>
<p>JIM WATSON/AFP/Getty Images</p>
</div>
</figure> <time datetime="2016-12-22T10:43:37Z">22 Dec, 2016</time> <time datetime="2016-12-22T18:59:12Z">22 Dec, 2016</time> </div>
Expand Down
24 changes: 13 additions & 11 deletions test/test-pages/bug-1255978/expected.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
<p>Here are some of the secrets that the receptionist will never tell you when you check in, according to answers posted on <a href="https://www.quora.com/What-are-the-things-we-dont-know-about-hotel-rooms" target="_blank">Quora</a>.</p>
<div>
<div>
<div><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2014/03/18/10/bandb2.jpg" alt="bandb2.jpg" title="bandb2.jpg" width="564" height="423" /></div>
<p><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2014/03/18/10/bandb2.jpg" alt="bandb2.jpg" title="bandb2.jpg" width="564" height="423" /></p>
</div>
<p> Even posh hotels might not wash a blanket in between stays </p>
<p>Even posh hotels might not wash a blanket in between stays </p>
</div>
<p>1. Take any blankets or duvets off the bed</p>
<p>Forrest Jones said that anything that comes into contact with any of the previous guest’s skin should be taken out and washed every time the room is made, but that even the fanciest hotels don’t always do so. "Hotels are getting away from comforters. Blankets are here to stay, however. But some hotels are still hesitant about washing them every day if they think they can get out of it," he said.</p>
Expand All @@ -18,25 +18,25 @@
</div>
<div>
<div>
<div><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2015/05/26/11/hotel-door-getty.jpg" alt="hotel-door-getty.jpg" title="hotel-door-getty.jpg" width="564" height="423" /></div>
<p><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2015/05/26/11/hotel-door-getty.jpg" alt="hotel-door-getty.jpg" title="hotel-door-getty.jpg" width="564" height="423" /></p>
</div>
<p> Forrest Jones advised stuffing the peep hole with a strip of rolled up notepaper when not in use. </p>
<p>Forrest Jones advised stuffing the peep hole with a strip of rolled up notepaper when not in use. </p>
</div>
<p>2. Check the peep hole has not been tampered with</p>
<p>This is not common, but can happen, Forrest Jones said. He advised stuffing the peep hole with a strip of rolled up notepaper when not in use. When someone knocks on the door, the paper can be removed to check who is there. If no one is visible, he recommends calling the front desk immediately. “I look forward to the day when I can tell you to choose only hotels where every employee who has access to guestroom keys is subjected to a complete public records background check, prior to hire, and every year or two thereafter. But for now, I can't,” he said.</p>
<div>
<div>
<div><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2013/07/31/15/luggage-3.jpg" alt="luggage-3.jpg" title="luggage-3.jpg" width="564" height="423" /></div>
<p><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2013/07/31/15/luggage-3.jpg" alt="luggage-3.jpg" title="luggage-3.jpg" width="564" height="423" /></p>
</div>
<p> Put luggage on the floor </p>
<p>Put luggage on the floor </p>
</div>
<p>3. Don’t use a wooden luggage rack</p>
<p>Bedbugs love wood. Even though a wooden luggage rack might look nicer and more expensive than a metal one, it’s a breeding ground for bugs. Forrest Jones says guests should put the items they plan to take from bags on other pieces of furniture and leave the bag on the floor.</p>
<div>
<div>
<div><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2015/04/13/11/Lifestyle-hotels.jpg" alt="Lifestyle-hotels.jpg" title="Lifestyle-hotels.jpg" width="564" height="423" /></div>
<p><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2015/04/13/11/Lifestyle-hotels.jpg" alt="Lifestyle-hotels.jpg" title="Lifestyle-hotels.jpg" width="564" height="423" /></p>
</div>
<p> The old rule of thumb is that for every 00 invested in a room, the hotel should charge in average daily rate </p>
<p>The old rule of thumb is that for every 00 invested in a room, the hotel should charge in average daily rate </p>
</div>
<p>4. Hotel rooms are priced according to how expensive they were to build</p>
<p>Zeev Sharon said that the old rule of thumb is that for every $1000 invested in a room, the hotel should charge $1 in average daily rate. So a room that cost $300,000 to build, should sell on average for $300/night.</p>
Expand All @@ -53,9 +53,9 @@ <h3>6. Mini bars almost always lose money</h3>
<p>Despite the snacks in the minibar seeming like the most overpriced food you have ever seen, hotel owners are still struggling to make a profit from those snacks. "Minibars almost always lose money, even when they charge $10 for a Diet Coke,” Sharon said.</p>
<div>
<div>
<div><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2014/03/13/16/agenda7.jpg" alt="agenda7.jpg" title="agenda7.jpg" width="564" height="423" /></div>
<p><img src="https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2014/03/13/16/agenda7.jpg" alt="agenda7.jpg" title="agenda7.jpg" width="564" height="423" /></p>
</div>
<p> Towels should always be cleaned between stays </p>
<p>Towels should always be cleaned between stays </p>
</div>
<p>7. Always made sure the hand towels are clean when you arrive</p>
<p>Forrest Jones made a discovery when he was helping out with the housekeepers. “You know where you almost always find a hand towel in any recently-vacated hotel room that was occupied by a guy? On the floor, next to the bed, about halfway down, maybe a little toward the foot of the bed. Same spot in the floor, next to almost every bed occupied by a man, in every room. I'll leave the rest to your imagination,” he said.</p>
Expand All @@ -64,5 +64,7 @@ <h3>6. Mini bars almost always lose money</h3>
<li> More about: </li>
<li><a itemprop="keywords" href="http://fakehost/topic/Hotels">Hotels</a></li>
<li><a itemprop="keywords" href="http://fakehost/topic/Hygiene">Hygiene</a></li>
</ul> <a href="http://fakehost/syndication/reuse-permision-form?url=http://www.independent.co.uk/news/business/news/seven-secrets-that-hotel-owners-dont-want-you-to-know-10506160.html" target="_blank"><img src="http://fakehost/sites/all/themes/ines_themes/independent_theme/img/reuse.png" width="25"/>Reuse content</a> </div>
</ul>
<p><a href="http://fakehost/syndication/reuse-permision-form?url=http://www.independent.co.uk/news/business/news/seven-secrets-that-hotel-owners-dont-want-you-to-know-10506160.html" target="_blank"><img src="http://fakehost/sites/all/themes/ines_themes/independent_theme/img/reuse.png" width="25"/>Reuse content</a> </p>
</div>
</div>
Loading

0 comments on commit 9f2c5cb

Please sign in to comment.