From 5a69d4a8eb5247b3b0638b100b34a45650359cb0 Mon Sep 17 00:00:00 2001 From: Daniel Aleksandersen Date: Sat, 25 Aug 2018 01:28:00 +0200 Subject: [PATCH] Improve metadata extraction (#478) * Improve metadata extraction * Recognize meta[property] as a space-separated list * Recognize Dublin Core (dc|dcterm): metadata. * Prefer Dublin Core, Open Graph, Twitter, and HTML in that order. * _getArticleTitle() is now only used as a fallback if the document doesn't provide good metadata. --- Readability.js | 78 ++++++++++--------- test/test-pages/002/expected-metadata.json | 2 +- .../expected-metadata.json | 7 ++ .../003-metadata-preferred/expected.html | 20 +++++ .../003-metadata-preferred/source.html | 45 +++++++++++ .../expected-metadata.json | 7 ++ .../expected.html | 20 +++++ .../source.html | 35 +++++++++ test/test-pages/bbc-1/expected-metadata.json | 4 +- .../breitbart/expected-metadata.json | 2 +- .../bug-1255978/expected-metadata.json | 2 +- .../buzzfeed-1/expected-metadata.json | 4 +- test/test-pages/ehow-1/expected-metadata.json | 4 +- test/test-pages/ehow-2/expected-metadata.json | 4 +- .../engadget/expected-metadata.json | 4 +- test/test-pages/heise/expected-metadata.json | 2 +- .../herald-sun-1/expected-metadata.json | 2 +- test/test-pages/iab-1/expected-metadata.json | 2 +- test/test-pages/ietf-1/expected-metadata.json | 4 +- .../keep-images/expected-metadata.json | 2 +- .../la-nacion/expected-metadata.json | 4 +- .../medium-1/expected-metadata.json | 2 +- .../medium-2/expected-metadata.json | 2 +- .../medium-3/expected-metadata.json | 4 +- test/test-pages/medium-3/source.html | 4 +- .../mozilla-1/expected-metadata.json | 2 +- .../test-pages/salon-1/expected-metadata.json | 2 +- .../simplyfound-1/expected-metadata.json | 2 +- test/test-pages/tumblr/expected-metadata.json | 2 +- .../wordpress/expected-metadata.json | 2 +- .../test-pages/yahoo-2/expected-metadata.json | 2 +- .../test-pages/yahoo-3/expected-metadata.json | 2 +- 
.../test-pages/yahoo-4/expected-metadata.json | 4 +- 33 files changed, 211 insertions(+), 73 deletions(-) create mode 100644 test/test-pages/003-metadata-preferred/expected-metadata.json create mode 100644 test/test-pages/003-metadata-preferred/expected.html create mode 100644 test/test-pages/003-metadata-preferred/source.html create mode 100644 test/test-pages/004-metadata-space-separated-properties/expected-metadata.json create mode 100644 test/test-pages/004-metadata-space-separated-properties/expected.html create mode 100644 test/test-pages/004-metadata-space-separated-properties/source.html diff --git a/Readability.js b/Readability.js index a59551ef..b93fec1d 100644 --- a/Readability.js +++ b/Readability.js @@ -1198,62 +1198,66 @@ Readability.prototype = { var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); - // Match "description", or Twitter's "twitter:description" (Cards) - // in name attribute. - var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/i; + // property is a space-separated list of values + var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title)\s*/gi; - // Match Facebook's Open Graph title & description properties. - var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/i; + // name is a single value + var namePattern = /^\s*(?:(dc|dcterm|og|twitter)\s*[\.:]\s*)?(author|creator|description|title)\s*$/i; // Find description tags. 
this._forEachNode(metaElements, function(element) { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); + var content = element.getAttribute("content"); + var matches = null; + var name = null; - if ([elementName, elementProperty].indexOf("author") !== -1) { - metadata.byline = element.getAttribute("content"); - return; + if (elementProperty) { + matches = elementProperty.match(propertyPattern); + if (matches) { + for (var i = matches.length - 1; i >= 0; i--) { + // Convert to lowercase, and remove any whitespace + // so we can match below. + name = matches[i].toLowerCase().replace(/\s/g, ""); + // multiple authors + values[name] = content.trim(); + } + } } - - var name = null; - if (namePattern.test(elementName)) { + if (!matches && elementName && namePattern.test(elementName)) { name = elementName; - } else if (propertyPattern.test(elementProperty)) { - name = elementProperty; - } - - if (name) { - var content = element.getAttribute("content"); if (content) { - // Convert to lowercase and remove any whitespace - // so we can match below. - name = name.toLowerCase().replace(/\s/g, ""); + // Convert to lowercase, remove any whitespace, and convert dots + // to colons so we can match below. + name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); values[name] = content.trim(); } } }); - if ("description" in values) { - metadata.excerpt = values["description"]; - } else if ("og:description" in values) { - // Use facebook open graph description. - metadata.excerpt = values["og:description"]; - } else if ("twitter:description" in values) { - // Use twitter cards description. 
- metadata.excerpt = values["twitter:description"]; - } + // get title + metadata.title = values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["title"] || + values["twitter:title"]; - metadata.title = this._getArticleTitle(); if (!metadata.title) { - if ("og:title" in values) { - // Use facebook open graph title. - metadata.title = values["og:title"]; - } else if ("twitter:title" in values) { - // Use twitter cards title. - metadata.title = values["twitter:title"]; - } + metadata.title = this._getArticleTitle(); } + // get author + metadata.byline = values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; + + // get description + metadata.excerpt = values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["description"] || + values["twitter:description"]; + return metadata; }, diff --git a/test/test-pages/002/expected-metadata.json b/test/test-pages/002/expected-metadata.json index 9b020e4f..d7b95e8c 100644 --- a/test/test-pages/002/expected-metadata.json +++ b/test/test-pages/002/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "This API is so Fetching! ✩ Mozilla Hacks – the Web developer blog", + "title": "This API is so Fetching!", "byline": "Nikhil Marathe", "excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. 
While very useful, XHR is not a very ...", "readerable": true diff --git a/test/test-pages/003-metadata-preferred/expected-metadata.json b/test/test-pages/003-metadata-preferred/expected-metadata.json new file mode 100644 index 00000000..055541d7 --- /dev/null +++ b/test/test-pages/003-metadata-preferred/expected-metadata.json @@ -0,0 +1,7 @@ +{ + "title": "Dublin Core property title", + "byline": "Dublin Core property author", + "dir": null, + "excerpt": "Dublin Core property description", + "readerable": true +} diff --git a/test/test-pages/003-metadata-preferred/expected.html b/test/test-pages/003-metadata-preferred/expected.html new file mode 100644 index 00000000..dced8c9c --- /dev/null +++ b/test/test-pages/003-metadata-preferred/expected.html @@ -0,0 +1,20 @@ +
+
+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+
+
diff --git a/test/test-pages/003-metadata-preferred/source.html b/test/test-pages/003-metadata-preferred/source.html new file mode 100644 index 00000000..c7275b3b --- /dev/null +++ b/test/test-pages/003-metadata-preferred/source.html @@ -0,0 +1,45 @@ + + + + + Title Element + + + + + + + + + + + + + + + + + + + +
+

Test document title

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+
+ + \ No newline at end of file diff --git a/test/test-pages/004-metadata-space-separated-properties/expected-metadata.json b/test/test-pages/004-metadata-space-separated-properties/expected-metadata.json new file mode 100644 index 00000000..14f2917d --- /dev/null +++ b/test/test-pages/004-metadata-space-separated-properties/expected-metadata.json @@ -0,0 +1,7 @@ +{ + "title": "Preferred title", + "byline": "Creator Name", + "dir": null, + "excerpt": "Preferred description", + "readerable": true +} diff --git a/test/test-pages/004-metadata-space-separated-properties/expected.html b/test/test-pages/004-metadata-space-separated-properties/expected.html new file mode 100644 index 00000000..dced8c9c --- /dev/null +++ b/test/test-pages/004-metadata-space-separated-properties/expected.html @@ -0,0 +1,20 @@ +
+
+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+
+
diff --git a/test/test-pages/004-metadata-space-separated-properties/source.html b/test/test-pages/004-metadata-space-separated-properties/source.html new file mode 100644 index 00000000..9b951008 --- /dev/null +++ b/test/test-pages/004-metadata-space-separated-properties/source.html @@ -0,0 +1,35 @@ + + + + + Title Element + + + + + + + + + +
+

Test document title

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+
+ + \ No newline at end of file diff --git a/test/test-pages/bbc-1/expected-metadata.json b/test/test-pages/bbc-1/expected-metadata.json index 028bc58c..f97a468b 100644 --- a/test/test-pages/bbc-1/expected-metadata.json +++ b/test/test-pages/bbc-1/expected-metadata.json @@ -1,6 +1,6 @@ { - "title": "Obama admits US gun laws are his 'biggest frustration'", + "title": "Obama admits US gun laws are his 'biggest frustration' - BBC News", "byline": null, - "excerpt": "President Barack Obama tells the BBC his failure to pass", + "excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.", "readerable": true } diff --git a/test/test-pages/breitbart/expected-metadata.json b/test/test-pages/breitbart/expected-metadata.json index a29bbbba..fb65fbe5 100644 --- a/test/test-pages/breitbart/expected-metadata.json +++ b/test/test-pages/breitbart/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?'", + "title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?' - Breitbart", "byline": "by Lucas Nolan22 Dec 2016651", "dir": "ltr", "excerpt": "Snopes fact checker and staff writer David Emery posted to Twitter asking if there were “any un-angry Trump supporters?”", diff --git a/test/test-pages/bug-1255978/expected-metadata.json b/test/test-pages/bug-1255978/expected-metadata.json index 7df5a410..77b16b41 100644 --- a/test/test-pages/bug-1255978/expected-metadata.json +++ b/test/test-pages/bug-1255978/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "The seven secrets that hotel owners don't want you to know", + "title": "Seven secrets that hotel owners don't want you to know", "byline": "Hazel Sheffield", "dir": null, "excerpt": "Most people go to hotels for the pleasure of sleeping in a giant bed with clean white sheets and waking up to fresh towels in the morning. 
But those towels and sheets might not be as clean as they look, according to the hotel bosses that responded to an online thread about the things hotel owners don’t want you to know.", diff --git a/test/test-pages/buzzfeed-1/expected-metadata.json b/test/test-pages/buzzfeed-1/expected-metadata.json index 8f171df0..18f6009d 100644 --- a/test/test-pages/buzzfeed-1/expected-metadata.json +++ b/test/test-pages/buzzfeed-1/expected-metadata.json @@ -1,6 +1,6 @@ { "title": "Student Dies After Diet Pills She Bought Online \"Burned Her Up From Within\"", - "byline": "Mark Di Stefano", - "excerpt": "An inquest into Eloise Parry's death has been adjourned until July...", + "byline": null, + "excerpt": "An inquest into Eloise Parry's death has been adjourned until July.", "readerable": true } diff --git a/test/test-pages/ehow-1/expected-metadata.json b/test/test-pages/ehow-1/expected-metadata.json index dfe10d2f..575786bf 100644 --- a/test/test-pages/ehow-1/expected-metadata.json +++ b/test/test-pages/ehow-1/expected-metadata.json @@ -1,7 +1,7 @@ { - "title": "How to Build a Terrarium (with Pictures)", + "title": "How to Build a Terrarium | eHow", "byline": "Lucy Akins", "dir": null, - "excerpt": "How to Build a Terrarium. Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. You...", + "excerpt": "Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. 
You won’t have to water the...", "readerable": true } diff --git a/test/test-pages/ehow-2/expected-metadata.json b/test/test-pages/ehow-2/expected-metadata.json index eb5baa0d..171a50a4 100644 --- a/test/test-pages/ehow-2/expected-metadata.json +++ b/test/test-pages/ehow-2/expected-metadata.json @@ -1,7 +1,7 @@ { - "title": "How to Throw a Graduation Party on a Budget (with Pictures)", + "title": "How to Throw a Graduation Party on a Budget | eHow", "byline": "Gina Roberts-Grey", "dir": null, - "excerpt": "How to Throw a Graduation Party on a Budget. Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. They’re also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food....", + "excerpt": "Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. They’re also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food. However that budget was based on Midwestern...", "readerable": true } diff --git a/test/test-pages/engadget/expected-metadata.json b/test/test-pages/engadget/expected-metadata.json index 2927bace..b984e5ae 100644 --- a/test/test-pages/engadget/expected-metadata.json +++ b/test/test-pages/engadget/expected-metadata.json @@ -1,7 +1,7 @@ { - "title": "Xbox One X review: A console that keeps up with gaming PCs", + "title": "Xbox One X review: A console that keeps up with gaming PCs", "byline": null, "dir": null, - "excerpt": "The Xbox One X is the ultimate video game system. It sports more horsepower than any system ever. 
And it plays more titles in native 4K than Sony's PlayStation...", + "excerpt": "The Xbox One X is the most powerful gaming console ever, but it's not for everyone yet.", "readerable": true } diff --git a/test/test-pages/heise/expected-metadata.json b/test/test-pages/heise/expected-metadata.json index 48a5655d..6ca3bc45 100644 --- a/test/test-pages/heise/expected-metadata.json +++ b/test/test-pages/heise/expected-metadata.json @@ -1,6 +1,6 @@ { "title": "1Password für Mac generiert Einmal-Passwörter", - "byline": null, + "byline": "Mac & i", "excerpt": "Das in der iOS-Version bereits enthaltene TOTP-Feature ist nun auch für OS X 10.10 verfügbar. Zudem gibt es neue Zusatzfelder in der Datenbank und weitere Verbesserungen.", "readerable": true } diff --git a/test/test-pages/herald-sun-1/expected-metadata.json b/test/test-pages/herald-sun-1/expected-metadata.json index 53802a2e..fcbe2586 100644 --- a/test/test-pages/herald-sun-1/expected-metadata.json +++ b/test/test-pages/herald-sun-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Angry media won’t buckle over new surveillance laws\n\t\t\t\t\t\t| Herald Sun", + "title": "Angry media won’t buckle over new surveillance laws", "byline": "JOE HILDEBRAND", "dir": null, "excerpt": "A HIGH-powered federal government team has been doing the rounds of media organisations in the past few days in an attempt to allay concerns about the impact of new surveillance legislation on press freedom. It failed.", diff --git a/test/test-pages/iab-1/expected-metadata.json b/test/test-pages/iab-1/expected-metadata.json index 8c95f5e3..9ddd8502 100644 --- a/test/test-pages/iab-1/expected-metadata.json +++ b/test/test-pages/iab-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Getting LEAN with Digital Ad UX", + "title": "Getting LEAN with Digital Ad UX | IAB", "byline": "By\n\t\t\tScott Cunningham", "excerpt": "We messed up. 
As technologists, tasked with delivering content and services to users, we lost track of the user experience. Twenty years ago we saw an explosion of websites, built by developers around the world, providing all forms of content. This was the beginning of an age of enlightenment, the intersection of content and technology. … Continued", "readerable": true diff --git a/test/test-pages/ietf-1/expected-metadata.json b/test/test-pages/ietf-1/expected-metadata.json index b8349d8e..21944b6b 100644 --- a/test/test-pages/ietf-1/expected-metadata.json +++ b/test/test-pages/ietf-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "draft-dejong-remotestorage-04 - remoteStorage", - "byline": "AUTHORING", + "title": "remoteStorage", + "byline": "Jong, Michiel de", "readerable": true } diff --git a/test/test-pages/keep-images/expected-metadata.json b/test/test-pages/keep-images/expected-metadata.json index a62d0ab3..19339906 100644 --- a/test/test-pages/keep-images/expected-metadata.json +++ b/test/test-pages/keep-images/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Inside the Deep Web Drug Lab — Backchannel — Medium", + "title": "Inside the Deep Web Drug Lab", "byline": "Joseph Cox", "excerpt": "Welcome to DoctorX’s Barcelona lab, where the drugs you bought online are tested for safety and purity. 
No questions ask…", "readerable": true diff --git a/test/test-pages/la-nacion/expected-metadata.json b/test/test-pages/la-nacion/expected-metadata.json index 355e30d8..012fef14 100644 --- a/test/test-pages/la-nacion/expected-metadata.json +++ b/test/test-pages/la-nacion/expected-metadata.json @@ -1,6 +1,6 @@ { - "title": "Una solución no violenta para la cuestión mapuche - 07.12.2017", + "title": "Una solución no violenta para la cuestión mapuche", "byline": null, - "excerpt": "Una solución no violenta para la cuestión mapuche | Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla - LA NACION", + "excerpt": "Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla", "readerable": true } diff --git a/test/test-pages/medium-1/expected-metadata.json b/test/test-pages/medium-1/expected-metadata.json index 232a0676..ecd6650e 100644 --- a/test/test-pages/medium-1/expected-metadata.json +++ b/test/test-pages/medium-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Better Student Journalism — Medium", + "title": "The Open Journalism Project: Better Student Journalism", "byline": "Pippin Lee", "excerpt": "We pushed out the first version of the Open Journalism site in January. 
Here’s what we’ve learned about student journali…", "readerable": true diff --git a/test/test-pages/medium-2/expected-metadata.json b/test/test-pages/medium-2/expected-metadata.json index dade1f44..2eaaccb9 100644 --- a/test/test-pages/medium-2/expected-metadata.json +++ b/test/test-pages/medium-2/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "On Behalf of “Literally” — Medium", + "title": "On Behalf of “Literally”", "byline": "Courtney Kirchoff", "excerpt": "In defense of the word “literally” and why you or someone you know should stop misusing the word, lest they drive us fig…", "readerable": true diff --git a/test/test-pages/medium-3/expected-metadata.json b/test/test-pages/medium-3/expected-metadata.json index ec8dc2b7..1b678964 100644 --- a/test/test-pages/medium-3/expected-metadata.json +++ b/test/test-pages/medium-3/expected-metadata.json @@ -1,7 +1,7 @@ { - "title": "Samantha and The Great Big Lie – John C. Welch – Medium", + "title": "Samantha and The Great Big Lie", "byline": "John C. Welch", "dir": null, - "excerpt": "(EDIT: removed the link to Samantha’s post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…", + "excerpt": "How to get shanked doing what people say they want", "readerable": true } diff --git a/test/test-pages/medium-3/source.html b/test/test-pages/medium-3/source.html index 6e3d0856..37c24641 100644 --- a/test/test-pages/medium-3/source.html +++ b/test/test-pages/medium-3/source.html @@ -2216,10 +2216,10 @@

Samantha and The Great Big Lie – Medium - + - + diff --git a/test/test-pages/mozilla-1/expected-metadata.json b/test/test-pages/mozilla-1/expected-metadata.json index 63a36994..77ad2640 100644 --- a/test/test-pages/mozilla-1/expected-metadata.json +++ b/test/test-pages/mozilla-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Firefox — Customize and make it your own — The most flexible browser on\n the Web — Mozilla", + "title": "Firefox — Customize and make it your own — The most flexible browser on the Web", "byline": null, "dir": "ltr", "excerpt": "It’s easier than ever to personalize Firefox and make it work the way\n you do.\n No other browser gives you so much choice and flexibility.", diff --git a/test/test-pages/salon-1/expected-metadata.json b/test/test-pages/salon-1/expected-metadata.json index 02cb6646..1f725357 100644 --- a/test/test-pages/salon-1/expected-metadata.json +++ b/test/test-pages/salon-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "The sharing economy is a lie: Uber, Ayn Rand and the truth about tech\n and libertarians", + "title": "The sharing economy is a lie: Uber, Ayn Rand and the truth about tech and libertarians", "byline": "Joanna Rothkopf", "excerpt": "Disruptive companies talk a good game about sharing. Uber's really just an under-regulated company making riches", "readerable": true diff --git a/test/test-pages/simplyfound-1/expected-metadata.json b/test/test-pages/simplyfound-1/expected-metadata.json index 1210bda1..75d9562b 100644 --- a/test/test-pages/simplyfound-1/expected-metadata.json +++ b/test/test-pages/simplyfound-1/expected-metadata.json @@ -1,6 +1,6 @@ { "title": "Raspberry Pi 3 - The credit card sized PC that cost only $35 - All-time bestselling computer in UK", "byline": null, - "excerpt": "The Raspberry Pi Foundation started by a handful of volunteers in 2012 when they released the original Raspberry Pi 256MB Model B without knowing what to expect. 
In a short four-year period they have grown to over sixty full-time employees and ha...", + "excerpt": "The Raspberry Pi Foundation started by a handful of volunteers in 2012 when they released the original Raspberry Pi 256MB Model B without knowing what to expect. In a short four-year period they have grown to over sixty full-time employees and ha...", "readerable": true } diff --git a/test/test-pages/tumblr/expected-metadata.json b/test/test-pages/tumblr/expected-metadata.json index 6801046e..32a19f94 100644 --- a/test/test-pages/tumblr/expected-metadata.json +++ b/test/test-pages/tumblr/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Minecraft 1.8 - The Bountiful Update - Minecraft 1.8 - The Bountiful Update", + "title": "Minecraft 1.8 - The Bountiful Update", "byline": null, "dir": null, "excerpt": "+ Added Granite, Andesite, and Diorite stone blocks, with smooth versions\n+ Added Slime Block\n+ Added Iron Trapdoor\n+ Added Prismarine and Sea Lantern blocks\n+ Added the Ocean Monument\n+ Added Red...", diff --git a/test/test-pages/wordpress/expected-metadata.json b/test/test-pages/wordpress/expected-metadata.json index 2ff3a4f9..ef79ecd2 100644 --- a/test/test-pages/wordpress/expected-metadata.json +++ b/test/test-pages/wordpress/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Stack Overflow Jobs Data Shows ReactJS Skills in High Demand, WordPress Market Oversaturated with Developers – WordPress Tavern", + "title": "Stack Overflow Jobs Data Shows ReactJS Skills in High Demand, WordPress Market Oversaturated with Developers", "byline": null, "dir": null, "excerpt": "Stack Overflow published its analysis of 2017 hiring trends based on the targeting options employers selected when posting to Stack Overflow Jobs. 
The report, which compares data from 200 companies…", diff --git a/test/test-pages/yahoo-2/expected-metadata.json b/test/test-pages/yahoo-2/expected-metadata.json index 03810e5e..f395e316 100644 --- a/test/test-pages/yahoo-2/expected-metadata.json +++ b/test/test-pages/yahoo-2/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Russia: Space ship malfunctions, breaks up over Siberia", + "title": "Yahoo News - Latest News & Headlines", "byline": "NATALIYA VASILYEVA", "excerpt": "The latest news and headlines from Yahoo! News. Get breaking news stories and in-depth coverage with videos and photos.", "readerable": true diff --git a/test/test-pages/yahoo-3/expected-metadata.json b/test/test-pages/yahoo-3/expected-metadata.json index b81ae3ec..ddb7c4bf 100644 --- a/test/test-pages/yahoo-3/expected-metadata.json +++ b/test/test-pages/yahoo-3/expected-metadata.json @@ -2,6 +2,6 @@ "title": "Veteran Wraps Baby in American Flag, Photo Sparks Controversy", "byline": "By GILLIAN MOHNEY\n March 11, 2015 3:46 PM", "dir": "ltr", - "excerpt": "From Yahoo: A photographer and Navy veteran is fighting back after a photo she posted to Facebook started an online backlash. Vanessa Hicks said she had no idea her photo would be considered controversial. The photo, from a military family’s newborn photo shoot, showed a newborn infant wrapped in an American flag held by his father, who was in his military uniform. Hicks, a Navy veteran herself and the wife of an active-duty Navy member, said her intention was to honor the flag as well as her clients, who wanted to incorporate their military service in the photo shoot.", + "excerpt": "A photographer and Navy veteran is fighting back after a photo she posted to Facebook started an online backlash. Vanessa Hicks said she had no idea her photo would be considered controversial. The photo, from a military family’s newborn photo shoot, showed a newborn infant wrapped in an American flag held by his father, who was in his military uniform. 
Hicks, a Navy veteran herself and the wife of an active-duty Navy member, said her intention was to honor the flag as well as her clients, who wanted to incorporate their military service in the photo shoot.", "readerable": true } diff --git a/test/test-pages/yahoo-4/expected-metadata.json b/test/test-pages/yahoo-4/expected-metadata.json index fbd86644..579077bc 100644 --- a/test/test-pages/yahoo-4/expected-metadata.json +++ b/test/test-pages/yahoo-4/expected-metadata.json @@ -1,7 +1,7 @@ { - "title": "トレンドマイクロ、公衆無線LANを安全に使うためのアプリ「フリーWi-Fiプロテクション」 (CNET Japan) - Yahoo!ニュース", + "title": "トレンドマイクロ、公衆無線LANを安全に使うためのアプリ「フリーWi-Fiプロテクション」(CNET Japan) - Yahoo!ニュース", "byline": null, "dir": null, - "excerpt": "トレンドマイクロは3月9日、Wi-Fi利用時の通信を暗号化し保護するスマホ・タブレッ", + "excerpt": "トレンドマイクロは3月9日、Wi-Fi利用時の通信を暗号化し保護するスマホ・タブレット - Yahoo!ニュース(CNET Japan)", "readerable": true }