Skip to content

Commit

Permalink
Add jsonpath includes for json parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
gburgett committed Jan 19, 2021
1 parent 0f0cb2c commit dbfc0e0
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 20 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
"cheerio": "^1.0.0-rc.3",
"cross-fetch": "^3.0.2",
"es6-promise": "^4.2.6",
"jsonpath-plus": "^5.0.2",
"universal-url": "^2.0.0",
"yargs": "^13.2.4"
}
Expand Down
72 changes: 60 additions & 12 deletions src/parsers/json-parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import { JsonParser } from './json-parser'

describe('JsonParser', () => {
it('finds a URL in the json body', async () => {
const parser = new JsonParser()
const parser = new JsonParser({
include: ["all"]
})

const { req, resp } = await makeResp(
'https://some-json.com',
Expand All @@ -18,12 +20,15 @@ describe('JsonParser', () => {
const results: URL[] = []
await parser.parse(resp, req, (result) => results.push(result))

expect(results.length).to.eq(1)
expect(results[0].toString()).to.eq('https://google.com/')
expect(results.map((r) => r.toString())).to.deep.eq([
'https://google.com/'
])
})

it('finds non-relative URLs in some json', async () => {
const parser = new JsonParser()
const parser = new JsonParser({
include: ["all"]
})

const { req, resp } = await makeResp(
'https://some-json.com/some-path',
Expand All @@ -32,12 +37,15 @@ describe('JsonParser', () => {
const results: URL[] = []
await parser.parse(resp, req, (result) => results.push(result))

expect(results.length).to.eq(1)
expect(results[0].toString()).to.eq('https://some-json.com/other-path')
expect(results.map((r) => r.toString())).to.deep.eq([
'https://some-json.com/other-path'
])
})

it('handles protocol relative URLs', async () => {
const parser = new JsonParser()
const parser = new JsonParser({
include: ["all"]
})

const { req, resp } = await makeResp(
'https://some-json.com/some-path',
Expand All @@ -46,12 +54,15 @@ describe('JsonParser', () => {
const results: URL[] = []
await parser.parse(resp, req, (result) => results.push(result))

expect(results.length).to.eq(1)
expect(results[0].toString()).to.eq('https://images.ctfassets.net/asdf.png')
expect(results.map((r) => r.toString())).to.deep.eq([
'https://images.ctfassets.net/asdf.png'
])
})

it('handles URLs in whitespace', async () => {
const parser = new JsonParser()
const parser = new JsonParser({
include: ["all"]
})

const { req, resp } = await makeResp(
'https://some-json.com/some-path',
Expand All @@ -60,8 +71,45 @@ describe('JsonParser', () => {
const results: URL[] = []
await parser.parse(resp, req, (result) => results.push(result))

expect(results.length).to.eq(1)
expect(results[0].toString()).to.eq('http://images.ctfassets.net/asdf.png')

expect(results.map((r) => r.toString())).to.deep.eq([
'http://images.ctfassets.net/asdf.png'
])
})

it('by default scans only "links" and "_links" objects', async () => {
  // No `include` option is passed, so the parser falls back to its defaults.
  const parser = new JsonParser({
  })

  // `slug` looks like a relative URL but does not sit under a links object,
  // so it must not appear in the output.
  const payload = JSON.stringify({
    data: {
      slug: '/test-slug',
      links: { self: '/some-rel-link' },
      _links: {
        other: '/some-other-rel-link',
        google: 'https://www.google.com'
      }
    },
    _links: {
      thirdLink: '/some-third-link'
    }
  })

  const { req, resp } = await makeResp('https://some-json.com/some-path', payload)

  const found: URL[] = []
  await parser.parse(resp, req, (result) => found.push(result))

  // Emission order is not part of the contract, so compare sorted hrefs.
  const hrefs = found.map((url) => url.toString())
  hrefs.sort()

  expect(hrefs).to.deep.eq([
    'https://some-json.com/some-other-rel-link',
    'https://some-json.com/some-rel-link',
    'https://some-json.com/some-third-link',
    'https://www.google.com/',
  ])
})
})

Expand Down
58 changes: 50 additions & 8 deletions src/parsers/json-parser.ts
Original file line number Diff line number Diff line change
@@ -1,34 +1,44 @@

import { JSONPath } from 'jsonpath-plus'
import { ParserOptions } from '.'
import { defaultLogger } from '../logger'
import { URL } from '../url'
import { parseUrl } from '../url'
import { assign, Options } from '../util'

/**
 * JSONPath expressions used when the caller does not supply an `include`
 * option. Each one uses recursive descent (`$..name`) to select every
 * property called `links`, `_links`, `link` or `_link` at any depth.
 */
const defaultIncludes = ['$..links', '$.._links', '$..link', '$.._link']

export class JsonParser {
public static readonly regexp = /^\s*((((ftp|http|https):)?\/\/)|\/)[^ "<\{\}]+\s*$/igm

private readonly _options: ParserOptions
private readonly _seen = new Set<string>()

constructor(options?: Options<ParserOptions>) {
this._options = assign(
{
logger: defaultLogger,
include: [],
include: defaultIncludes,
},
options,
)

if (this._options.include.includes('all')) {
this._options.include = ['$..*']
}
}

public async parse(response: Response, request: Request, push: (result: URL) => void): Promise<void> {
const baseUrl = response.url || request.url

for (const { value } of traverse(await response.json())) {
// do something here with each key and value
if (typeof value == 'string') {
if (value.match(JsonParser.regexp)) {
this._tryEmit(value, baseUrl, push)
}
for (const potentialLink of this.traverse(await response.json())) {
if (potentialLink.match(JsonParser.regexp)) {
this._tryEmit(potentialLink, baseUrl, push)
}
}
}
Expand All @@ -41,7 +51,39 @@ export class JsonParser {
this._options.logger.debug(`bad href: '${match}'`)
return
}
push(url)

if (!this._seen.has(url.toString())) {
this._seen.add(url.toString())
push(url)
}
}

/**
 * Yields every candidate string selected by the configured JSONPath
 * `include` expressions.
 *
 * For each value a path selects:
 *  - a string is yielded as-is;
 *  - an iterable (e.g. an array) yields each of its string elements;
 *  - a plain object yields each of its own string-valued properties.
 * Anything else (numbers, booleans, `null`, nested containers) is ignored;
 * nested values are only reached if the path itself selects them.
 *
 * @param json the parsed JSON document to search
 * @returns an iterable of raw strings; the caller decides which look like URLs
 */
private* traverse(json: any): Iterable<string> {
  for (const path of this._options.include) {
    for (const obj of JSONPath({ path, json })) {
      // JSON `null` is a legal selected value (common with `$..*`). Reading
      // `obj[Symbol.iterator]` on it would throw, so skip it up front.
      if (obj == null) { continue }

      // directly selected strings
      if (typeof obj == 'string') {
        yield obj

      // arrays
      } else if (typeof obj[Symbol.iterator] === 'function') {
        for (const val of obj) {
          if (typeof val == 'string') {
            yield val
          }
        }

      // objects
      } else if (typeof obj == 'object') {
        // Object.keys already returns own enumerable keys only. Calling
        // obj.hasOwnProperty here would throw for documents that contain a
        // "hasOwnProperty" key, so no extra ownership check is made.
        for (const key of Object.keys(obj)) {
          const val = obj[key]
          if (typeof val == 'string') {
            yield val
          }
        }
      }
    }
  }
}
}

Expand Down
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4130,6 +4130,11 @@ jsonfile@^4.0.0:
optionalDependencies:
graceful-fs "^4.1.6"

jsonpath-plus@^5.0.2:
version "5.0.2"
resolved "https://registry.yarnpkg.com/jsonpath-plus/-/jsonpath-plus-5.0.2.tgz#ede88d064264172560ab1420a366a106cfab683d"
integrity sha512-J1StEInJIb5INbUkzf/DM6mby0hEyU2o6kw+AUzrJnrgMunvDKdZgGFXEqH5qA2TVF3mVH7A6ZZQJpcNXXg90A==

jsprim@^1.2.2:
version "1.4.1"
resolved "https://registry.yarnpkg.com/jsprim/-/jsprim-1.4.1.tgz#313e66bc1e5cc06e438bc1b7499c2e5c56acb6a2"
Expand Down

0 comments on commit dbfc0e0

Please sign in to comment.