Skip to content

Commit

Permalink
JSON Summary feature
Browse files Browse the repository at this point in the history
Implements #136

- Added a `Scoop.summary()` method for generating a summary object. Can be invoked at any point during capture.
- Added a `--json-summary-output` CLI option for generating a summary at the end of the capture process and saving it to disk as JSON.

---

Profile of the summary object:
```js
/**
 * @typedef {Object} ScoopCaptureSummary
 * @property {number} state
 * @property {object} states - Possible values of Scoop.state
 * @property {string} targetUrl
 * @property {boolean} targetUrlIsWebPage
 * @property {ScoopOptions} options
 * @property {string} startedAt - ISO-formatted date
 * @property {string[]} blockedRequests
 * @property {string[]} noArchiveUrls
 * @property {string[]} exchangeUrls
 * @property {?string} captureIp
 * @property {?string} userAgent
 */
```
  • Loading branch information
matteocargnelutti committed Apr 3, 2023
1 parent 642beb1 commit ea609ed
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 5 deletions.
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ More info: https://github.com/harvard-lil/scoop
Options:
-v, --version Display Scoop and Scoop CLI version.
-o, --output <string> Output path. (default: "./archive.wacz")
-o, --output <string> Output path. (default: "[directory]/archive.wacz")
-f, --format <string> Output format. (choices: "warc", "warc-gzipped", "wacz", "wacz-with-raw", default: "wacz")
--json-summary-output <string> If set, allows for saving a capture summary as JSON. Must be a path to .json file.
--signing-url <string> Authsign-compatible endpoint for signing WACZ file.
--signing-token <string> Authentication token to --signing-url, if needed.
--screenshot <bool> Add screenshot step to capture? (choices: "true", "false", default: "true")
Expand All @@ -177,8 +178,7 @@ Options:
--capture-video-as-attachment <bool> Add capture video(s) as attachment(s) step to capture? (choices: "true", "false", default: "true")
--capture-certificates-as-attachment <bool> Add capture certificate(s) as attachment(s) step to capture? (choices: "true", "false", default: "true")
--provenance-summary <bool> Add provenance summary to capture? (choices: "true", "false", default: "true")
--attachments-bypass-limits <bool> If active, attachments will not count towards time and size constraints imposed on capture (--capture-timeout, --max--capture-size).
(choices: "true", "false", default: "true")
--attachments-bypass-limits <bool> If active, attachments will not count towards time and size constraints imposed on capture (--capture-timeout, --max-capture-size). (choices: "true", "false", default: "true")
--capture-timeout <number> Maximum time allocated to capture process before hard cut-off, in ms. (default: 60000)
--load-timeout <number> Max time Scoop will wait for the page to load, in ms. (default: 20000)
--network-idle-timeout <number> Max time Scoop will wait for the in-browser networking tasks to complete, in ms. (default: 20000)
Expand All @@ -194,8 +194,7 @@ Options:
--run-site-specific-behaviors <bool> Should Scoop run site-specific capture behaviors? (via: browsertrix-behaviors) (choices: "true", "false", default: "true")
--headless <bool> Should Chrome run in headless mode? (choices: "true", "false", default: "true")
--user-agent-suffix <string> If provided, will be appended to Chrome's user agent. (default: "")
--blocklist <string> If set, replaces Scoop's default list of url patterns and IP ranges Scoop should not capture. Coma-separated. Example:
"/https?://localhost/,0.0.0.0/8,10.0.0.0".
--blocklist <string> If set, replaces Scoop's default list of url patterns and IP ranges Scoop should not capture. Comma-separated. Example: "/https?://localhost/,0.0.0.0/8,10.0.0.0".
--intercepter <string> ScoopIntercepter class to be used to intercept network exchanges. (default: "ScoopProxy")
--proxy-host <string> Hostname to be used by Scoop's HTTP proxy. (default: "localhost")
--proxy-port <string> Port to be used by Scoop's HTTP proxy. (default: 9000)
Expand Down
35 changes: 35 additions & 0 deletions Scoop.js
Original file line number Diff line number Diff line change
Expand Up @@ -1366,4 +1366,39 @@ export class Scoop {
async toWACZ (includeRaw = true, signingServer) {
return await exporters.scoopToWACZ(this, includeRaw, signingServer)
}

/**
* @typedef {Object} ScoopCaptureSummary
* @property {number} state
* @property {object} states - Possible values of Scoop.state
* @property {string} targetUrl
* @property {boolean} targetUrlIsWebPage
* @property {ScoopOptions} options
* @property {string} startedAt - ISO-formatted date
* @property {string[]} blockedRequests
* @property {string[]} noArchiveUrls
* @property {string[]} exchangeUrls
* @property {?string} captureIp
* @property {?string} userAgent
*/

/**
* Generates and returns a summary of the current capture object, regardless of its state.
* @returns {ScoopCaptureSummary}
*/
async summary () {
return {
state: this.state,
states: Scoop.states,
targetUrl: this.url,
targetUrlIsWebPage: this.targetUrlIsWebPage,
options: this.options,
startedAt: this.startedAt,
blockedRequests: [],
noArchiveUrls: [],
captureIp: this.provenanceInfo?.captureIp,
userAgent: this.provenanceInfo?.userAgent,
exchangeUrls: this.exchanges.map(exchange => exchange.url)
}
}
}
9 changes: 9 additions & 0 deletions Scoop.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ await test('Scoop - capture of a web page.', async (t) => {
assert(attachment.response.body.includes('<!DOCTYPE html>'))
})

await t.test('Scoop.summary() returns a valid object', async (_t) => {
  // Run a full capture first, then summarize the resulting capture object.
  const capture = await Scoop.capture(`${URL}/test.html`, options)
  const summary = await capture.summary()

  // Truthiness is asserted before destructuring, so a missing summary fails as an assertion.
  assert(summary)
  const { targetUrl, state, exchangeUrls } = summary

  assert.equal(targetUrl, capture.url)
  assert.equal(state, Scoop.states.COMPLETE)
  assert.equal(exchangeUrls.length, capture.exchanges.length)
})

/*
* TEARDOWN
*/
Expand Down
32 changes: 32 additions & 0 deletions bin/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ program.addOption(
.default('wacz')
)

// Optional path of a `.json` file to which a capture summary is written at the end of the capture process.
// No default value is declared: the summary is only saved when this option is provided.
program.addOption(
new Option(
'--json-summary-output <string>',
'If set, allows for saving a capture summary as JSON. Must be a path to .json file. ')
)

//
// Signing
//
Expand Down Expand Up @@ -347,6 +353,17 @@ program.action(async (name, options, command) => {
process.exit(1)
}

// `options.jsonSummaryOutput`: if set, must end with `.json` and sit in a directory that exists.
if (options.jsonSummaryOutput) {
  try {
    // `assert()` takes an optional failure *message* as its second argument, not an
    // expected value: the condition alone is what is being checked here.
    assert(options.jsonSummaryOutput.endsWith('.json'))
    // Only the parent directory is checked, not the file itself.
    // Presumably `fs` is the promise-based API, since the call is awaited — TODO confirm.
    await fs.access(path.dirname(options.jsonSummaryOutput))
  } catch (err) {
    // Either check failing leads to the same user-facing message and a non-zero exit code.
    console.error('JSON summary path must end with .json and lead to a directory that exists.')
    process.exit(1)
  }
}

// Convert 'true' / 'false' strings to booleans.
for (const [key, value] of Object.entries(options)) {
if (value === 'true') {
Expand Down Expand Up @@ -420,6 +437,21 @@ program.action(async (name, options, command) => {
process.exit(1)
}

//
// Optional JSON summary: only written when `--json-summary-output` was provided.
//
if (options.jsonSummaryOutput) {
  const { jsonSummaryOutput } = options

  try {
    // Summary generation, serialization and the write share one failure path.
    const captureSummary = await capture.summary()
    await fs.writeFile(jsonSummaryOutput, JSON.stringify(captureSummary, null, 2))
    capture.log.info(`${jsonSummaryOutput} saved to disk.`)
  } catch (err) {
    // Full error goes to the trace level; the error-level message points users to it.
    capture.log.trace(err)
    capture.log.error(`Something went wrong while saving ${jsonSummaryOutput} to disk. Use --log-level trace for details.`)
    process.exit(1)
  }
}

process.exit(0)
})

Expand Down

0 comments on commit ea609ed

Please sign in to comment.