Skip to content

Commit

Permalink
Improve metadata extraction (mozilla#478)
Browse files Browse the repository at this point in the history
* Improve metadata extraction

* Recognize meta[property] as a space-separated list
* Recognize Dublin Core (dc|dcterm): metadata.
* Prefer Dublin Core, Open Graph, Twitter, and HTML in that order.
* _getArticleTitle() is now only used as a fallback if the document
 doesn't provide good metadata.
  • Loading branch information
da2x authored and gijsk committed Aug 24, 2018
1 parent 0449dbf commit 5a69d4a
Show file tree
Hide file tree
Showing 33 changed files with 211 additions and 73 deletions.
78 changes: 41 additions & 37 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -1198,62 +1198,66 @@ Readability.prototype = {
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");

// Match "description", or Twitter's "twitter:description" (Cards)
// in name attribute.
var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/i;
// property is a space-separated list of values
var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title)\s*/gi;

// Match Facebook's Open Graph title & description properties.
var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/i;
// name is a single value
var namePattern = /^\s*(?:(dc|dcterm|og|twitter)\s*[\.:]\s*)?(author|creator|description|title)\s*$/i;

// Find description tags.
this._forEachNode(metaElements, function(element) {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
var content = element.getAttribute("content");
var matches = null;
var name = null;

if ([elementName, elementProperty].indexOf("author") !== -1) {
metadata.byline = element.getAttribute("content");
return;
if (elementProperty) {
matches = elementProperty.match(propertyPattern);
if (matches) {
for (var i = matches.length - 1; i >= 0; i--) {
// Convert to lowercase, and remove any whitespace
// so we can match below.
name = matches[i].toLowerCase().replace(/\s/g, "");
// multiple authors
values[name] = content.trim();
}
}
}

var name = null;
if (namePattern.test(elementName)) {
if (!matches && elementName && namePattern.test(elementName)) {
name = elementName;
} else if (propertyPattern.test(elementProperty)) {
name = elementProperty;
}

if (name) {
var content = element.getAttribute("content");
if (content) {
// Convert to lowercase and remove any whitespace
// so we can match below.
name = name.toLowerCase().replace(/\s/g, "");
// Convert to lowercase, remove any whitespace, and convert dots
// to colons so we can match below.
name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
values[name] = content.trim();
}
}
});

if ("description" in values) {
metadata.excerpt = values["description"];
} else if ("og:description" in values) {
// Use facebook open graph description.
metadata.excerpt = values["og:description"];
} else if ("twitter:description" in values) {
// Use twitter cards description.
metadata.excerpt = values["twitter:description"];
}
// get title
metadata.title = values["dc:title"] ||
values["dcterm:title"] ||
values["og:title"] ||
values["title"] ||
values["twitter:title"];

metadata.title = this._getArticleTitle();
if (!metadata.title) {
if ("og:title" in values) {
// Use facebook open graph title.
metadata.title = values["og:title"];
} else if ("twitter:title" in values) {
// Use twitter cards title.
metadata.title = values["twitter:title"];
}
metadata.title = this._getArticleTitle();
}

// get author
metadata.byline = values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"];

// get description
metadata.excerpt = values["dc:description"] ||
values["dcterm:description"] ||
values["og:description"] ||
values["description"] ||
values["twitter:description"];

return metadata;
},

Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/002/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "This API is so Fetching! ✩ Mozilla Hacks – the Web developer blog",
"title": "This API is so Fetching!",
"byline": "Nikhil Marathe",
"excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ...",
"readerable": true
Expand Down
7 changes: 7 additions & 0 deletions test/test-pages/003-metadata-preferred/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"title": "Dublin Core property title",
"byline": "Dublin Core property author",
"dir": null,
"excerpt": "Dublin Core property description",
"readerable": true
}
20 changes: 20 additions & 0 deletions test/test-pages/003-metadata-preferred/expected.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<div id="readability-page-1" class="page">
<article>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</div>
45 changes: 45 additions & 0 deletions test/test-pages/003-metadata-preferred/source.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Title Element</title>
<meta name="title" content="Meta name title"/>
<meta name="og:title" content="Open Graph name title"/>
<meta name="twitter:title" content="Twitter name title"/>
<meta name="DC.title" content="Dublin Core name title"/>
<meta property="dc:title" content="Dublin Core property title"/>
<meta property="twitter:title" content="Twitter property title"/>
<meta property="og:title" content="Open Graph property title"/>
<meta name="author" content="Meta name author"/>
<meta name="DC.creator" content="Dublin Core name author"/>
<meta property="dc:creator" content="Dublin Core property author"/>
<meta name="description" content="Meta name description"/>
<meta name="og:description" content="Open Graph name description"/>
<meta name="twitter:description" content="Twitter name description"/>
<meta name="DC.description" content="Dublin Core name description"/>
<meta property="dc:description" content="Dublin Core property description"/>
<meta property="twitter:description" content="Twitter property description"/>
<meta property="og:description" content="Open Graph property description"/>
</head>
<body>
<article>
<h1>Test document title</h1>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"title": "Preferred title",
"byline": "Creator Name",
"dir": null,
"excerpt": "Preferred description",
"readerable": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<div id="readability-page-1" class="page">
<article>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Title Element</title>
<meta property="x:title dc:title" content="Preferred title"/>
<meta property="og:title twitter:title" content="A title"/>
<meta property="dc:creator twitter:site_name" content="Creator Name"/>
<meta name="author" content="FAIL"/>
<meta property="og:description x:description twitter:description" content="A description"/>
<meta property="dc:description og:description" content="Preferred description"/>
<meta name="description" content="FAIL"/>
</head>
<body>
<article>
<h1>Test document title</h1>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</body>
</html>
4 changes: 2 additions & 2 deletions test/test-pages/bbc-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "Obama admits US gun laws are his 'biggest frustration'",
"title": "Obama admits US gun laws are his 'biggest frustration' - BBC News",
"byline": null,
"excerpt": "President Barack Obama tells the BBC his failure to pass",
"excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.",
"readerable": true
}
2 changes: 1 addition & 1 deletion test/test-pages/breitbart/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?'",
"title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?' - Breitbart",
"byline": "by Lucas Nolan22 Dec 2016651",
"dir": "ltr",
"excerpt": "Snopes fact checker and staff writer David Emery posted to Twitter asking if there were “any un-angry Trump supporters?”",
Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/bug-1255978/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "The seven secrets that hotel owners don't want you to know",
"title": "Seven secrets that hotel owners don't want you to know",
"byline": "Hazel Sheffield",
"dir": null,
"excerpt": "Most people go to hotels for the pleasure of sleeping in a giant bed with clean white sheets and waking up to fresh towels in the morning. But those towels and sheets might not be as clean as they look, according to the hotel bosses that responded to an online thread about the things hotel owners don’t want you to know.",
Expand Down
4 changes: 2 additions & 2 deletions test/test-pages/buzzfeed-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "Student Dies After Diet Pills She Bought Online \"Burned Her Up From Within\"",
"byline": "Mark Di Stefano",
"excerpt": "An inquest into Eloise Parry's death has been adjourned until July...",
"byline": null,
"excerpt": "An inquest into Eloise Parry's death has been adjourned until July.",
"readerable": true
}
4 changes: 2 additions & 2 deletions test/test-pages/ehow-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"title": "How to Build a Terrarium (with Pictures)",
"title": "How to Build a Terrarium | eHow",
"byline": "Lucy Akins",
"dir": null,
"excerpt": "How to Build a Terrarium. Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. You...",
"excerpt": "Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. You won’t have to water the...",
"readerable": true
}
4 changes: 2 additions & 2 deletions test/test-pages/ehow-2/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"title": "How to Throw a Graduation Party on a Budget (with Pictures)",
"title": "How to Throw a Graduation Party on a Budget | eHow",
"byline": "Gina Roberts-Grey",
"dir": null,
"excerpt": "How to Throw a Graduation Party on a Budget. Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. They’re also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food....",
"excerpt": "Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. They’re also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food. However that budget was based on Midwestern...",
"readerable": true
}
4 changes: 2 additions & 2 deletions test/test-pages/engadget/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"title": "Xbox One X review: A console that keeps up with gaming PCs",
"title": "Xbox One X review: A console that keeps up with gaming PCs",
"byline": null,
"dir": null,
"excerpt": "The Xbox One X is the ultimate video game system. It sports more horsepower than any system ever. And it plays more titles in native 4K than Sony's PlayStation...",
"excerpt": "The Xbox One X is the most powerful gaming console ever, but it's not for everyone yet.",
"readerable": true
}
2 changes: 1 addition & 1 deletion test/test-pages/heise/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "1Password für Mac generiert Einmal-Passwörter",
"byline": null,
"byline": "Mac & i",
"excerpt": "Das in der iOS-Version bereits enthaltene TOTP-Feature ist nun auch für OS X 10.10 verfügbar. Zudem gibt es neue Zusatzfelder in der Datenbank und weitere Verbesserungen.",
"readerable": true
}
2 changes: 1 addition & 1 deletion test/test-pages/herald-sun-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "Angry media won’t buckle over new surveillance laws\n\t\t\t\t\t\t| Herald Sun",
"title": "Angry media won’t buckle over new surveillance laws",
"byline": "JOE HILDEBRAND",
"dir": null,
"excerpt": "A HIGH-powered federal government team has been doing the rounds of media organisations in the past few days in an attempt to allay concerns about the impact of new surveillance legislation on press freedom. It failed.",
Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/iab-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "Getting LEAN with Digital Ad UX",
"title": "Getting LEAN with Digital Ad UX | IAB",
"byline": "By\n\t\t\tScott Cunningham",
"excerpt": "We messed up. As technologists, tasked with delivering content and services to users, we lost track of the user experience. Twenty years ago we saw an explosion of websites, built by developers around the world, providing all forms of content. This was the beginning of an age of enlightenment, the intersection of content and technology. … Continued",
"readerable": true
Expand Down
4 changes: 2 additions & 2 deletions test/test-pages/ietf-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "draft-dejong-remotestorage-04 - remoteStorage",
"byline": "AUTHORING",
"title": "remoteStorage",
"byline": "Jong, Michiel de",
"readerable": true
}
2 changes: 1 addition & 1 deletion test/test-pages/keep-images/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "Inside the Deep Web Drug Lab — Backchannel — Medium",
"title": "Inside the Deep Web Drug Lab",
"byline": "Joseph Cox",
"excerpt": "Welcome to DoctorX’s Barcelona lab, where the drugs you bought online are tested for safety and purity. No questions ask…",
"readerable": true
Expand Down
4 changes: 2 additions & 2 deletions test/test-pages/la-nacion/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "Una solución no violenta para la cuestión mapuche - 07.12.2017",
"title": "Una solución no violenta para la cuestión mapuche",
"byline": null,
"excerpt": "Una solución no violenta para la cuestión mapuche | Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla - LA NACION",
"excerpt": "Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla",
"readerable": true
}
2 changes: 1 addition & 1 deletion test/test-pages/medium-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "Better Student Journalism — Medium",
"title": "The Open Journalism Project: Better Student Journalism",
"byline": "Pippin Lee",
"excerpt": "We pushed out the first version of the Open Journalism site in January. Here’s what we’ve learned about student journali…",
"readerable": true
Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/medium-2/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "On Behalf of “Literally” — Medium",
"title": "On Behalf of “Literally”",
"byline": "Courtney Kirchoff",
"excerpt": "In defense of the word “literally” and why you or someone you know should stop misusing the word, lest they drive us fig…",
"readerable": true
Expand Down
4 changes: 2 additions & 2 deletions test/test-pages/medium-3/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"title": "Samantha and The Great Big Lie – John C. Welch – Medium",
"title": "Samantha and The Great Big Lie",
"byline": "John C. Welch",
"dir": null,
"excerpt": "(EDIT: removed the link to Samantha’s post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…",
"excerpt": "How to get shanked doing what people say they want",
"readerable": true
}
4 changes: 2 additions & 2 deletions test/test-pages/medium-3/source.html
Original file line number Diff line number Diff line change
Expand Up @@ -2216,10 +2216,10 @@ <h3 class="u-fontSizeBase u-lineHeightTighter u-marginBottom4"><a class="link li
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Samantha and The Great Big Lie – Medium</title>
<link rel="canonical" href="https://medium.com/@johncwelch/samantha-and-the-great-big-lie-d146a92473a1" />
<meta name="title" content="Samantha and The Great Big Lie – John C. Welch – Medium" />
<meta name="title" content="Samantha and The Great Big Lie" />
<meta name="referrer" content="always" />
<meta name="description" content="(EDIT: removed the link to Samantha’s post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…" />
<meta property="og:title" content="Samantha and The Great Big Lie – John C. Welch – Medium" />
<meta property="og:title" content="Samantha and The Great Big Lie" />
<meta property="og:url" content="https://medium.com/@johncwelch/samantha-and-the-great-big-lie-d146a92473a1#.h9kzgon9m" />
<meta property="og:image" content="https://cdn-images-1.medium.com/max/1200/1*kbPh7V97eyRodSOw2-ALDw.png" />
<meta property="fb:app_id" content="542599432471018" />
Expand Down
Loading

0 comments on commit 5a69d4a

Please sign in to comment.