Skip to content

Commit

Permalink
JSON-LD support (microlinkhq#173)
Browse files Browse the repository at this point in the history
* feat: provide parsed JSON-LD to rules

* feat: implement JSON-LD retrieval of date

* feat: implement JSON-LD retrieval of title

* feat: implement JSON-LD retrieval of publisher

* feat: implement JSON-LD retrieval of description

* feat: implement JSON-LD retrieval of image

* feat: implement JSON-LD retrieval of lang

* feat: add support for <meta itemprop="inLanguage">

* refactor: extract jsonld as helper

* build: sort order

* build: add more jsonld rules

* build: ensure to pass url

* build: move rule

low priority

* build: ensure to decode html entities

* build: setup better image rules

* test: update snapshot

* build: remove jsonLd variable

* docs: tweaks

* test: update snapshot
  • Loading branch information
plaa authored and Kikobeats committed May 31, 2019
1 parent fc423f0 commit 99897e8
Show file tree
Hide file tree
Showing 22 changed files with 3,671 additions and 1,564 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
max_line_length = 100
max_line_length = 80
indent_brace_style = 1TBS
spaces_around_operators = true
quote_type = auto
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
[![Dependency Status](https://david-dm.org/microlinkhq/metascraper.svg?path=packages/metascraper&style=flat-square)](https://david-dm.org/microlinkhq/metascraper?path=packages/metascraper)
[![NPM Status](https://img.shields.io/npm/dm/metascraper.svg?style=flat-square)](https://www.npmjs.org/package/metascraper)

> A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and series of fallbacks.
> A library to easily scrape metadata from an article on the web using Open Graph, JSON-LD, regular HTML metadata, and a series of fallbacks.
## Table of Contents

Expand Down
7 changes: 4 additions & 3 deletions packages/metascraper-author/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'use strict'

const { $filter, author } = require('@metascraper/helpers')
const { $jsonld, $filter, author } = require('@metascraper/helpers')

const REGEX_STRICT = /^\S+\s+\S+/

Expand All @@ -11,8 +11,8 @@ const REGEX_STRICT = /^\S+\s+\S+/
* @return {Function} wrapped
*/

const wrap = rule => ({ htmlDom }) => {
const value = rule(htmlDom)
const wrap = rule => ({ htmlDom, url }) => {
const value = rule(htmlDom, url)
return author(value)
}

Expand All @@ -34,6 +34,7 @@ const strict = rule => $ => {

module.exports = () => ({
author: [
wrap($jsonld('author.name')),
wrap($ => $('meta[name="author"]').attr('content')),
wrap($ => $('meta[property="author"]').attr('content')),
wrap($ => $('meta[property="article:author"]').attr('content')),
Expand Down
9 changes: 6 additions & 3 deletions packages/metascraper-date/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'use strict'

const { date, $filter } = require('@metascraper/helpers')
const { date, $filter, $jsonld } = require('@metascraper/helpers')

/**
* Wrap a rule with validation and formatting logic.
Expand All @@ -9,8 +9,8 @@ const { date, $filter } = require('@metascraper/helpers')
* @return {Function} wrapped
*/

const wrap = rule => ({ htmlDom }) => {
const value = rule(htmlDom)
const wrap = rule => ({ htmlDom, url }) => {
const value = rule(htmlDom, url)
return date(value)
}

Expand All @@ -20,6 +20,9 @@ const wrap = rule => ({ htmlDom }) => {

module.exports = () => ({
date: [
wrap($jsonld('dateModified')),
wrap($jsonld('datePublished')),
wrap($jsonld('dateCreated')),
wrap($ => $('meta[property*="updated_time" i]').attr('content')),
wrap($ => $('meta[property*="modified_time" i]').attr('content')),
wrap($ => $('meta[property*="published_time" i]').attr('content')),
Expand Down
8 changes: 5 additions & 3 deletions packages/metascraper-description/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'use strict'

const { $filter, description } = require('@metascraper/helpers')
const { $filter, $jsonld, description } = require('@metascraper/helpers')

/**
* Wrap a rule with validation and formatting logic.
Expand All @@ -9,8 +9,8 @@ const { $filter, description } = require('@metascraper/helpers')
* @return {Function} wrapped
*/

const wrap = rule => ({ htmlDom }) => {
const value = rule(htmlDom)
const wrap = rule => ({ htmlDom, url }) => {
const value = rule(htmlDom, url)
return description(value)
}

Expand All @@ -20,10 +20,12 @@ const wrap = rule => ({ htmlDom }) => {

module.exports = () => ({
description: [
wrap($jsonld('description')),
wrap($ => $('meta[property="og:description"]').attr('content')),
wrap($ => $('meta[name="twitter:description"]').attr('content')),
wrap($ => $('meta[name="description"]').attr('content')),
wrap($ => $('meta[itemprop="description"]').attr('content')),
wrap($jsonld('articleBody')),
wrap($ => $('#description').text()),
wrap($ => $filter($, $('[class*="content" i] > p'))),
wrap($ => $filter($, $('[class*="content" i] p')))
Expand Down
61 changes: 61 additions & 0 deletions packages/metascraper-helpers/__snapshots__/index.js.snap-shot
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
exports['object if JSON-LD is preset into the html 1'] = {
"@context": "http://schema.org",
"@type": "NewsArticle",
"mainEntityOfPage": "https://www.theverge.com/2017/11/16/16667366/tesla-semi-truck-announced-price-release-date-electric-self-driving",
"headline": "This is the Tesla Semi truck",
"description": "500 miles of range and more aerodynamic than a supercar",
"speakable": {
"@type": "SpeakableSpecification",
"xpath": [
"/html/head/title",
"/html/head/meta[@name='description']/@content"
]
},
"datePublished": "2017-11-16T23:47:07-05:00",
"dateModified": "2017-11-16T23:47:07-05:00",
"author": {
"@type": "Person",
"name": "Zac Estrada"
},
"publisher": {
"@type": "Organization",
"name": "The Verge",
"logo": {
"@type": "ImageObject",
"url": "https://cdn.vox-cdn.com/uploads/chorus_asset/file/13668586/google_amp.0.png",
"width": 600,
"height": 60
}
},
"about": {
"@type": "Event",
"name": "Tesla Semi Truck Event 2017",
"startDate": "2017-11-17T04:00:00+00:00",
"location": {
"@type": "Place",
"name": "Tesla Motors factory",
"address": "Hawthorne, California, USA"
}
},
"image": [
{
"@type": "ImageObject",
"url": "https://cdn.vox-cdn.com/thumbor/k8ssXKPAuRwxa1pKew982ZMgv0o=/1400x1400/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/9699573/Semi_Front_Profile.jpg",
"width": 1400,
"height": 1400
},
{
"@type": "ImageObject",
"url": "https://cdn.vox-cdn.com/thumbor/l6nkV8CkJIdUrJIzHFWUFc1zLRM=/1400x1050/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/9699573/Semi_Front_Profile.jpg",
"width": 1400,
"height": 1050
},
{
"@type": "ImageObject",
"url": "https://cdn.vox-cdn.com/thumbor/5Sqo6J73lBi1hwzEiKCQy6FLx3I=/1400x788/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/9699573/Semi_Front_Profile.jpg",
"width": 1400,
"height": 788
}
]
}

69 changes: 50 additions & 19 deletions packages/metascraper-helpers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const {
replace,
includes,
isString,
isArray,
trim,
flow,
chain,
Expand All @@ -23,6 +24,7 @@ const fileExtension = require('file-extension')
const { resolve: resolveUrl } = require('url')
const _normalizeUrl = require('normalize-url')
const smartquotes = require('smartquotes')
const { decodeHTML } = require('entities')
const mimeTypes = require('mime-types')
const chrono = require('chrono-node')
const truncate = require('truncate')
Expand All @@ -31,6 +33,7 @@ const toTitle = require('title')
const isUri = require('is-uri')
const { URL } = require('url')
const urlLib = require('url')
const mem = require('mem')

const VIDEO = 'video'
const AUDIO = 'audio'
Expand Down Expand Up @@ -113,9 +116,11 @@ const protocol = url => {
return protocol.replace(':', '')
}

const isMediaTypeUrl = (url, type, opts) => isUrl(url, opts) && isMediaTypeExtension(url, type)
const isMediaTypeUrl = (url, type, opts) =>
isUrl(url, opts) && isMediaTypeExtension(url, type)

const isMediaTypeExtension = (url, type) => eq(type, get(EXTENSIONS, extension(url)))
const isMediaTypeExtension = (url, type) =>
eq(type, get(EXTENSIONS, extension(url)))

const isMediaUrl = (url, opts) =>
isImageUrl(url, opts) || isVideoUrl(url, opts) || isAudioUrl(url, opts)
Expand Down Expand Up @@ -196,29 +201,55 @@ const isMime = (contentType, type) => {
return eq(type, get(EXTENSIONS, ext))
}

/**
 * Read and parse the first `<script type="application/ld+json">` element
 * of a document.
 *
 * The function is wrapped with `mem`, using a `cacheKey` that returns the
 * `url` argument, so the result is presumably computed once per distinct
 * `url` and reused afterwards (confirm against the `mem` version in use).
 *
 * @param {String} url  address of the document; used as the cache key.
 * @param {Function} $  cheerio-like selector function bound to the document.
 * @return {*} the parsed JSON-LD value, or `{}` when reading or parsing
 *   the script content throws.
 */
const jsonld = mem(
  (url, $) => {
    let parsed = {}

    try {
      const script = $('script[type="application/ld+json"]').first()
      const raw = script.contents().text()
      parsed = JSON.parse(raw)
    } catch (err) {
      // Missing or malformed JSON-LD is not an error for callers:
      // keep the empty-object fallback.
    }

    return parsed
  },
  { cacheKey: url => url }
)

/**
 * Build a rule that picks a property out of the document's JSON-LD data.
 *
 * @param {String} propName  path passed to lodash `get`
 *   (e.g. `'author.name'`).
 * @return {Function} `($, url) => value`, where `value` is the property
 *   found at `propName` in the result of `jsonld(url, $)`, or `undefined`
 *   when the path does not exist.
 */
const $jsonld = propName => ($, url) => {
  const json = jsonld(url, $)
  const value = get(json, propName)
  // HTML entities can only appear in strings. Objects, arrays, numbers and
  // booleans are returned untouched instead of being pushed through the
  // string decoder, so structured values (e.g. an `image` list) survive.
  return isString(value) ? decodeHTML(value) : value
}

module.exports = {
author,
title,
lang,
url,
description,
date,
$filter,
titleize,
$jsonld,
absoluteUrl,
sanetizeUrl,
author,
date,
description,
extension,
protocol,
publisher,
normalizeUrl,
isMime,
isUrl,
isMediaUrl,
isVideoUrl,
isArray,
isAudioExtension,
isAudioUrl,
isImageExtension,
isImageUrl,
isMediaExtension,
isMediaUrl,
isMime,
isString,
isUrl,
isVideoExtension,
isAudioExtension,
isImageExtension
isVideoUrl,
jsonld,
lang,
normalizeUrl,
protocol,
publisher,
sanetizeUrl,
title,
titleize,
url
}
4 changes: 4 additions & 0 deletions packages/metascraper-helpers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
"condense-whitespace": "~2.0.0",
"entities": "~1.1.2",
"file-extension": "~4.0.5",
"image-extensions": "~1.1.0",
"is-relative-url": "~3.0.0",
"is-uri": "~1.2.0",
"iso-639-3": "~1.2.0",
"isostring": "0.0.1",
"lodash": "~4.17.11",
"mem": "~5.0.0",
"mime-types": "~2.1.24",
"normalize-url": "~4.3.0",
"smartquotes": "~2.3.1",
Expand All @@ -35,9 +37,11 @@
"video-extensions": "~1.1.0"
},
"devDependencies": {
"cheerio": "latest",
"mocha": "latest",
"nyc": "latest",
"should": "latest",
"snap-shot": "latest",
"standard": "latest"
},
"engines": {
Expand Down
Loading

0 comments on commit 99897e8

Please sign in to comment.