From dbfc0e06355923adfb5be978ec0a4eed77dbcd54 Mon Sep 17 00:00:00 2001 From: Gordon Date: Tue, 19 Jan 2021 09:36:08 -0600 Subject: [PATCH] Add jsonpath includes for json parsing --- package.json | 1 + src/parsers/json-parser.test.ts | 72 +++++++++++++++++++++++++++------ src/parsers/json-parser.ts | 58 ++++++++++++++++++++++---- yarn.lock | 5 +++ 4 files changed, 116 insertions(+), 20 deletions(-) diff --git a/package.json b/package.json index 415d0eb..b99e743 100644 --- a/package.json +++ b/package.json @@ -84,6 +84,7 @@ "cheerio": "^1.0.0-rc.3", "cross-fetch": "^3.0.2", "es6-promise": "^4.2.6", + "jsonpath-plus": "^5.0.2", "universal-url": "^2.0.0", "yargs": "^13.2.4" } diff --git a/src/parsers/json-parser.test.ts b/src/parsers/json-parser.test.ts index 1f2434a..2d122f9 100644 --- a/src/parsers/json-parser.test.ts +++ b/src/parsers/json-parser.test.ts @@ -8,7 +8,9 @@ import { JsonParser } from './json-parser' describe('JsonParser', () => { it('finds a URL in the json body', async () => { - const parser = new JsonParser() + const parser = new JsonParser({ + include: ["all"] + }) const { req, resp } = await makeResp( 'https://some-json.com', @@ -18,12 +20,15 @@ describe('JsonParser', () => { const results: URL[] = [] await parser.parse(resp, req, (result) => results.push(result)) - expect(results.length).to.eq(1) - expect(results[0].toString()).to.eq('https://google.com/') + expect(results.map((r) => r.toString())).to.deep.eq([ + 'https://google.com/' + ]) }) it('finds non-relative URLs in some json', async () => { - const parser = new JsonParser() + const parser = new JsonParser({ + include: ["all"] + }) const { req, resp } = await makeResp( 'https://some-json.com/some-path', @@ -32,12 +37,15 @@ describe('JsonParser', () => { const results: URL[] = [] await parser.parse(resp, req, (result) => results.push(result)) - expect(results.length).to.eq(1) - expect(results[0].toString()).to.eq('https://some-json.com/other-path') + expect(results.map((r) => r.toString())).to.deep.eq([ + 'https://some-json.com/other-path' + ]) }) it('handles protocol relative URLs', async () => { - const parser = new JsonParser() + const parser = new JsonParser({ + include: ["all"] + }) const { req, resp } = await makeResp( 'https://some-json.com/some-path', @@ -46,12 +54,15 @@ describe('JsonParser', () => { const results: URL[] = [] await parser.parse(resp, req, (result) => results.push(result)) - expect(results.length).to.eq(1) - expect(results[0].toString()).to.eq('https://images.ctfassets.net/asdf.png') + expect(results.map((r) => r.toString())).to.deep.eq([ + 'https://images.ctfassets.net/asdf.png' + ]) }) it('handles URLs in whitespace', async () => { - const parser = new JsonParser() + const parser = new JsonParser({ + include: ["all"] + }) const { req, resp } = await makeResp( 'https://some-json.com/some-path', @@ -60,8 +71,45 @@ describe('JsonParser', () => { const results: URL[] = [] await parser.parse(resp, req, (result) => results.push(result)) - expect(results.length).to.eq(1) - expect(results[0].toString()).to.eq('http://images.ctfassets.net/asdf.png') + + expect(results.map((r) => r.toString())).to.deep.eq([ + 'http://images.ctfassets.net/asdf.png' + ]) + }) + + it('by default scans only "links" and "_links" objects', async () => { + const data = { + data: { + slug: '/test-slug', + links: { self: "/some-rel-link" }, + _links: { + other: "/some-other-rel-link", + google: "https://www.google.com" + } + }, + _links: { + thirdLink: '/some-third-link' + } + } + + const parser = new JsonParser({ + }) + + const { req, resp } = await makeResp( + 'https://some-json.com/some-path', + JSON.stringify(data), + ) + const results: URL[] = [] + await parser.parse(resp, req, (result) => results.push(result)) + + // order doesn't matter in these results + const sorted = results.map(r => r.toString()).sort() + expect(sorted).to.deep.eq([ + 'https://some-json.com/some-other-rel-link', + 'https://some-json.com/some-rel-link', + 'https://some-json.com/some-third-link', + "https://www.google.com/", + ]) }) }) diff --git a/src/parsers/json-parser.ts b/src/parsers/json-parser.ts index 35a526c..2d31566 100644 --- a/src/parsers/json-parser.ts +++ b/src/parsers/json-parser.ts @@ -1,34 +1,44 @@ +import { JSONPath } from 'jsonpath-plus' import { ParserOptions } from '.' import { defaultLogger } from '../logger' import { URL } from '../url' import { parseUrl } from '../url' import { assign, Options } from '../util' +const defaultIncludes = [ + '$..links', + '$.._links', + '$..link', + '$.._link' +] + export class JsonParser { public static readonly regexp = /^\s*((((ftp|http|https):)?\/\/)|\/)[^ "<\{\}]+\s*$/igm private readonly _options: ParserOptions + private readonly _seen = new Set() constructor(options?: Options) { this._options = assign( { logger: defaultLogger, - include: [], + include: defaultIncludes, }, options, ) + + if (this._options.include.includes('all')) { + this._options.include = ['$..*'] + } } public async parse(response: Response, request: Request, push: (result: URL) => void): Promise { const baseUrl = response.url || request.url - for (const { value } of traverse(await response.json())) { - // do something here with each key and value - if (typeof value == 'string') { - if (value.match(JsonParser.regexp)) { - this._tryEmit(value, baseUrl, push) - } + for (const potentialLink of this.traverse(await response.json())) { + if (potentialLink.match(JsonParser.regexp)) { + this._tryEmit(potentialLink, baseUrl, push) } } } @@ -41,7 +51,39 @@ export class JsonParser { this._options.logger.debug(`bad href: '${match}'`) return } - push(url) + + if (!this._seen.has(url.toString())) { + this._seen.add(url.toString()) + push(url) + } + } + + private* traverse(json: any): Iterable { + for(const path of this._options.include) { + for (const obj of JSONPath({ path, json })) { + // directly selected strings + if (typeof obj == 'string') { + yield obj + + // arrays + } else if (typeof obj[Symbol.iterator] === 'function') { + for (const val of obj) { + if (typeof val == 'string') { + yield val + } + } + + // objects + } else if (typeof obj == 'object') { + for (const key of Object.keys(obj)) { + if (!obj.hasOwnProperty(key)) { continue } + if (typeof obj[key] != 'string') { continue } + + yield obj[key] + } + } + } + } } } diff --git a/yarn.lock b/yarn.lock index 56412a2..c115814 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4130,6 +4130,11 @@ jsonfile@^4.0.0: optionalDependencies: graceful-fs "^4.1.6" +jsonpath-plus@^5.0.2: + version "5.0.2" + resolved "https://registry.yarnpkg.com/jsonpath-plus/-/jsonpath-plus-5.0.2.tgz#ede88d064264172560ab1420a366a106cfab683d" + integrity sha512-J1StEInJIb5INbUkzf/DM6mby0hEyU2o6kw+AUzrJnrgMunvDKdZgGFXEqH5qA2TVF3mVH7A6ZZQJpcNXXg90A== + jsprim@^1.2.2: version "1.4.1" resolved "https://registry.yarnpkg.com/jsprim/-/jsprim-1.4.1.tgz#313e66bc1e5cc06e438bc1b7499c2e5c56acb6a2"