From ea609edd50df902dac4b0fe37a83c88df7779257 Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Mon, 3 Apr 2023 17:27:08 -0400 Subject: [PATCH] JSON Summary feature Implements #136 - Added a `Scoop.summary()` method for generating a summary object. Can be invoked at any point during capture. - Added a `--json-summary-output` CLI option for generating a summary at the end of the capture process and saving it to disk as JSON. --- Profile of the summary object: ```javascript /** * @typedef {Object} ScoopCaptureSummary * @property {int} state * @property {object} states - Possible values of Scoop.state * @property {string} targetUrl * @property {boolean} targetUrlIsWebPage * @property {ScoopOptions} options * @property {string} startedAt - ISO-formatted date * @property {string[]} blockedRequests * @property {string[]} noArchiveUrls * @property {string[]} exchangeUrls * @property {?string} captureIp * @property {?string} userAgent */ ``` --- README.md | 9 ++++----- Scoop.js | 35 +++++++++++++++++++++++++++++++++++ Scoop.test.js | 9 +++++++++ bin/cli.js | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3c5c06b5..11b6953c 100644 --- a/README.md +++ b/README.md @@ -167,8 +167,9 @@ More info: https://github.com/harvard-lil/scoop Options: -v, --version Display Scoop and Scoop CLI version. - -o, --output Output path. (default: "./archive.wacz") + -o, --output Output path. (default: "[directory]/archive.wacz") -f, --format Output format. (choices: "warc", "warc-gzipped", "wacz", "wacz-with-raw", default: "wacz") + --json-summary-output If set, allows for saving a capture summary as JSON. Must be a path to .json file. --signing-url Authsign-compatible endpoint for signing WACZ file. --signing-token Authentication token to --signing-url, if needed. --screenshot Add screenshot step to capture? 
(choices: "true", "false", default: "true") @@ -177,8 +178,7 @@ Options: --capture-video-as-attachment Add capture video(s) as attachment(s) step to capture? (choices: "true", "false", default: "true") --capture-certificates-as-attachment Add capture certificate(s) as attachment(s) step to capture? (choices: "true", "false", default: "true") --provenance-summary Add provenance summary to capture? (choices: "true", "false", default: "true") - --attachments-bypass-limits If active, attachments will not count towards time and size constraints imposed on capture (--capture-timeout, --max--capture-size). - (choices: "true", "false", default: "true") + --attachments-bypass-limits If active, attachments will not count towards time and size constraints imposed on capture (--capture-timeout, --max-capture-size). (choices: "true", "false", default: "true") --capture-timeout Maximum time allocated to capture process before hard cut-off, in ms. (default: 60000) --load-timeout Max time Scoop will wait for the page to load, in ms. (default: 20000) --network-idle-timeout Max time Scoop will wait for the in-browser networking tasks to complete, in ms. (default: 20000) @@ -194,8 +194,7 @@ Options: --run-site-specific-behaviors Should Scoop run site-specific capture behaviors? (via: browsertrix-behaviors) (choices: "true", "false", default: "true") --headless Should Chrome run in headless mode? (choices: "true", "false", default: "true") --user-agent-suffix If provided, will be appended to Chrome's user agent. (default: "") - --blocklist If set, replaces Scoop's default list of url patterns and IP ranges Scoop should not capture. Coma-separated. Example: - "/https?://localhost/,0.0.0.0/8,10.0.0.0". + --blocklist If set, replaces Scoop's default list of url patterns and IP ranges Scoop should not capture. Comma-separated. Example: "/https?://localhost/,0.0.0.0/8,10.0.0.0". --intercepter ScoopIntercepter class to be used to intercept network exchanges. 
(default: "ScoopProxy") --proxy-host Hostname to be used by Scoop's HTTP proxy. (default: "localhost") --proxy-port Port to be used by Scoop's HTTP proxy. (default: 9000) diff --git a/Scoop.js b/Scoop.js index ea8bcde6..35055ceb 100644 --- a/Scoop.js +++ b/Scoop.js @@ -1366,4 +1366,39 @@ export class Scoop { async toWACZ (includeRaw = true, signingServer) { return await exporters.scoopToWACZ(this, includeRaw, signingServer) } + + /** + * @typedef {Object} ScoopCaptureSummary + * @property {int} state + * @property {object} states - Possible values of Scoop.state + * @property {string} targetUrl + * @property {boolean} targetUrlIsWebPage + * @property {ScoopOptions} options + * @property {string} startedAt - ISO-formatted date + * @property {string[]} blockedRequests - Currently always an empty array + * @property {string[]} noArchiveUrls - Currently always an empty array + * @property {string[]} exchangeUrls + * @property {?string} captureIp + * @property {?string} userAgent + */ + + /** + * Generates and returns a summary of the current capture object, regardless of its state. 
+ * @returns {Promise<ScoopCaptureSummary>} + */ + async summary () { + return { + state: this.state, + states: Scoop.states, + targetUrl: this.url, + targetUrlIsWebPage: this.targetUrlIsWebPage, + options: this.options, + startedAt: this.startedAt, + blockedRequests: [], + noArchiveUrls: [], + captureIp: this.provenanceInfo?.captureIp, + userAgent: this.provenanceInfo?.userAgent, + exchangeUrls: this.exchanges.map(exchange => exchange.url) + } + } } diff --git a/Scoop.test.js b/Scoop.test.js index f74d874b..2e5f2705 100644 --- a/Scoop.test.js +++ b/Scoop.test.js @@ -88,6 +88,15 @@ await test('Scoop - capture of a web page.', async (t) => { assert(attachment.response.body.includes('')) }) + await t.test('Scoop.summary() returns a valid object', async (_t) => { + const capture = await Scoop.capture(`${URL}/test.html`, options) + const summary = await capture.summary() + assert(summary) + assert.equal(summary.targetUrl, capture.url) + assert.equal(summary.state, Scoop.states.COMPLETE) + assert.equal(summary.exchangeUrls.length, capture.exchanges.length) + }) + /* * TEARDOWN */ diff --git a/bin/cli.js b/bin/cli.js index b71d2c69..f35a9b5b 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -40,6 +40,12 @@ program.addOption( .default('wacz') ) +program.addOption( + new Option( + '--json-summary-output ', + 'If set, allows for saving a capture summary as JSON. Must be a path to .json file. ') +) + // // Signing // @@ -347,6 +353,17 @@ program.action(async (name, options, command) => { process.exit(1) } + // `options.jsonSummaryOutput`: if set, must end with `.json` and its parent directory must be accessible + if (options.jsonSummaryOutput) { + try { + assert(options.jsonSummaryOutput.endsWith('.json'), true) + await fs.access(path.dirname(options.jsonSummaryOutput)) + } catch (err) { + console.error('JSON summary path must end with .json and lead to a directory that exists.') + process.exit(1) + } + } + // Convert 'true' / 'false' strings to booleans. 
for (const [key, value] of Object.entries(options)) { if (value === 'true') { @@ -420,6 +437,21 @@ program.action(async (name, options, command) => { process.exit(1) } + // + // JSON summary (?) + // + if (options.jsonSummaryOutput) { + try { + const summary = JSON.stringify(await capture.summary(), null, 2) + await fs.writeFile(options.jsonSummaryOutput, summary) + capture.log.info(`${options.jsonSummaryOutput} saved to disk.`) + } catch (err) { + capture.log.trace(err) + capture.log.error(`Something went wrong while saving ${options.jsonSummaryOutput} to disk. Use --log-level trace for details.`) + process.exit(1) + } + } + process.exit(0) })