Skip to content

Commit

Permalink
Remove partial support for noarchive detection
Browse files Browse the repository at this point in the history
`node-html-parser` dependency was also removed as a result.
  • Loading branch information
matteocargnelutti committed Aug 14, 2023
1 parent c0e69fd commit a5e736d
Show file tree
Hide file tree
Showing 7 changed files with 1 addition and 108 deletions.
2 changes: 0 additions & 2 deletions Scoop.js
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ export class Scoop {
* osVersion: ?string,
* cpuArchitecture: ?string,
* blockedRequests: Array.<{match: string, rule: string}>,
* noArchiveUrls: string[],
* certificates: Array.<{host: string, pem: string}>,
* ytDlpHash: string,
* cripHash: string,
Expand All @@ -164,7 +163,6 @@ export class Scoop {
*/
provenanceInfo = {
blockedRequests: [],
noArchiveUrls: [],
certificates: []
}

Expand Down
15 changes: 0 additions & 15 deletions assets/templates/provenance-summary.njk
Original file line number Diff line number Diff line change
Expand Up @@ -150,21 +150,6 @@
</section>
{% endif %}

{% if noArchiveUrls.length %}
<section>
<h2>Urls tagged with the "noarchive" directive</h2>

<p>The following urls returned an HTML document tagged with the <em>noarchive</em> directive.</p>

<ul>
{% for url in noArchiveUrls %}
<li>{{ url }}</li>
{% endfor %}
</ul>

</section>
{% endif %}

{% if certificates.length %}
<section>
<h2>SSL/TLS Certificates</h2>
Expand Down
55 changes: 0 additions & 55 deletions intercepters/ScoopIntercepter.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
import { strict as assert } from 'node:assert'
import { parse as parseHTML } from 'node-html-parser'

import { Scoop } from '../Scoop.js'
import { bodyToString } from '../utils/http.js'
import { ScoopProxyExchange } from '../exchanges/ScoopProxyExchange.js'

/**
* @class ScoopIntercepter
Expand Down Expand Up @@ -84,56 +79,6 @@ export class ScoopIntercepter {
return {}
}

/**
* Tries to find the "noarchive" directive in a given exchange.
* If found, keeps trace of match in `Scoop.provenanceInfo`.
*
* @param {ScoopExchange} exchange
* @returns {boolean} - `true` if request contained "noarchive"
*/
async checkExchangeForNoArchive (exchange) {
// Exit early if exchange is not a ScoopProxyExchange
if (exchange instanceof ScoopProxyExchange === false) {
return false
}

// Exit early if this isn't an HTML document
if (!exchange?.response?.bodyCombined ||
!exchange?.response?.headers?.get('content-type')?.toLowerCase().startsWith('text/html')) {
return false
}

// Handle deflate / gzip / brotly compression
const contentEncoding = exchange.response.headers.get('content-encoding')

let responseBody = null
try {
responseBody = await bodyToString(exchange.response.bodyCombined, contentEncoding)
} catch (err) {
this.capture.log.info(`Error while decompressing ${contentEncoding} body. Assuming "noarchive" directive is absent.`)
this.capture.log.trace(err)
return false
}

// Skip if "noarchive" cannot be found in the document
if (!responseBody.match(/noarchive/i)) {
return false
}

// Parse DOM and look for full "noarchive" meta.
try {
const dom = parseHTML(responseBody)
assert(dom.querySelector('[content*=\'noarchive\']'))
} catch {
return false
}

// If we reached this point: this exchange is "noarchive".
this.capture.log.warn(`${exchange.url} was tagged with the "noarchive" directive.`)
this.capture.provenanceInfo.noArchiveUrls.push(exchange.url)
return true
}

/**
* Checks whether the total byte length has exceeded
* the capture's limit and, if so, stops intercepting exchanges.
Expand Down
32 changes: 0 additions & 32 deletions intercepters/ScoopIntercepter.test.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import test from 'node:test'
import assert from 'node:assert/strict'

import { FIXTURES_PATH } from '../constants.js'
import { ScoopIntercepter } from './index.js'
import { Scoop } from '../Scoop.js'
import { ScoopProxyExchange } from '../exchanges/ScoopProxyExchange.js'

test('ScoopIntercepter constructor "capture" argument must be a Scoop instance.', async (_t) => {
for (const capture of [{}, [], true, 12, 'FOO', ['FOO'], () => {}]) {
Expand All @@ -19,36 +17,6 @@ test('ScoopIntercepter setup and teardown methods throw as not implemented.', as
assert.rejects(intercepter.teardown())
})

// Loads a capture known to contain a "noarchive" page and verifies that
// exactly one of its proxy exchanges is flagged by the intercepter.
test('checkExchangeForNoArchive returns true when noarchive directive is present in exchange.', async (_t) => {
  const capture = await Scoop.fromWACZ(`${FIXTURES_PATH}/noarchive.netlify.app.wacz`)
  const intercepter = new ScoopIntercepter(capture)

  // Number of ScoopProxyExchange entries flagged as "noarchive" (expected: exactly 1).
  let flaggedCount = 0

  for (const exchange of capture.exchanges) {
    // Exchanges that are not ScoopProxyExchange instances are ignored.
    if (exchange instanceof ScoopProxyExchange) {
      const isFlagged = await intercepter.checkExchangeForNoArchive(exchange)
      flaggedCount += Number(isFlagged)
    }
  }

  assert.equal(flaggedCount, 1)
})

// Loads a capture without any "noarchive" page and verifies that none of
// its exchanges is flagged by the intercepter.
test('checkExchangeForNoArchive returns false when noarchive directive is not present in exchange.', async (_t) => {
  const capture = await Scoop.fromWACZ(`${FIXTURES_PATH}/example.com.wacz`)
  const intercepter = new ScoopIntercepter(capture)

  // Every exchange is checked, one after the other; none should be flagged.
  let flaggedCount = 0

  for (const exchange of capture.exchanges) {
    const isFlagged = await intercepter.checkExchangeForNoArchive(exchange)
    flaggedCount += Number(isFlagged)
  }

  assert.equal(flaggedCount, 0)
})

test('checkAndEnforceSizeLimit interrupts capture when size limit is reached.', async (_t) => {
const capture = new Scoop('https://example.com')
const intercepter = new ScoopIntercepter(capture)
Expand Down
3 changes: 1 addition & 2 deletions intercepters/ScoopProxy.js
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ export class ScoopProxy extends ScoopIntercepter {

/**
* On response:
* - Check for "noarchive" directive
* - Parse response
* @param {http.ServerResponse} response
* @param {http.ClientRequest} request
*/
Expand All @@ -157,7 +157,6 @@ export class ScoopProxy extends ScoopIntercepter {

if (exchange) {
exchange.responseParsed = response
response.on('end', () => this.checkExchangeForNoArchive(exchange))
}
}

Expand Down
1 change: 0 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@
"get-os-info": "^1.0.2",
"loglevel": "^1.8.1",
"loglevel-plugin-prefix": "^0.8.4",
"node-html-parser": "^6.1.4",
"node-stream-zip": "^1.15.0",
"nunjucks": "^3.2.3",
"playwright": "^1.37.0",
Expand Down

0 comments on commit a5e736d

Please sign in to comment.