Skip to content

Commit 280c10e

Browse files
committed
feat(web-scraper): add support for configurable list of extra modules allowed in extractor scripts
1 parent 226c366 commit 280c10e

File tree

7 files changed: 83 additions and 5 deletions

components/retrack-web-scraper/src/api/status/get.test.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ await test('[/api/status] returns version from the config', async () => {
2222
},
2323
},
2424
server: { bodyLimit: 5 * 1024 * 1024 },
25-
25+
extractorSandbox: { extraAllowedModules: [] },
2626
port: 3,
2727
};
2828
const response = await registerStatusGetRoutes(createMock({ config: configMock })).inject({

components/retrack-web-scraper/src/api/web_page/constants.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
import type { RemoteBrowserConfig } from '../../config.js';
1+
import type { ExtractorSandboxConfig, RemoteBrowserConfig } from '../../config.js';
22

33
/**
44
* Default timeout for the extractor script, in ms.
@@ -37,6 +37,8 @@ export interface WorkerResultMessage {
3737
export interface WorkerData {
3838
// The browser config that the worker should connect Playwright to.
3939
browserConfig: RemoteBrowserConfig;
40+
// The configuration for the extractor sandbox.
41+
extractorSandboxConfig: ExtractorSandboxConfig;
4042
// The extractor script that the worker should execute.
4143
extractor: string;
4244
// The parameters that should be passed to the extractor script.

components/retrack-web-scraper/src/api/web_page/execute.test.ts

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import * as assert from 'node:assert/strict';
22
import { test, beforeEach, afterEach } from 'node:test';
3+
import { configure } from '../../config.js';
34
import { createBrowserServerMock } from '../../mocks.js';
45

56
import { registerExecuteRoutes } from './execute.js';
@@ -162,6 +163,39 @@ export async function execute(page) {
162163
assert.strictEqual(response.statusCode, 200);
163164
});
164165

166+
await test('[/api/web_page/execute] allows extractor scripts to import configured extra modules', async (t) => {
167+
t.mock.method(Date, 'now', () => 123000);
168+
169+
const response = await registerExecuteRoutes(
170+
createMock({
171+
wsEndpoint: browserServerMock.endpoint,
172+
config: {
173+
...configure(),
174+
extractorSandbox: { extraAllowedModules: ['node:fs'] },
175+
browser: { chromium: { protocol: 'cdp', backend: 'chromium', wsEndpoint: browserServerMock.endpoint } },
176+
},
177+
}),
178+
).inject({
179+
method: 'POST',
180+
url: '/api/web_page/execute',
181+
payload: {
182+
extractor: `
183+
export async function execute() {
184+
await import('timers');
185+
await import('node:fs');
186+
return 'OK';
187+
};
188+
`
189+
.replaceAll('\n', '')
190+
.trim(),
191+
tags: [],
192+
},
193+
});
194+
195+
assert.strictEqual(response.body, JSON.stringify('OK'));
196+
assert.strictEqual(response.statusCode, 200);
197+
});
198+
165199
await test('[/api/web_page/execute] allows extractor scripts to import `data:` modules', async (t) => {
166200
t.mock.method(Date, 'now', () => 123000);
167201

components/retrack-web-scraper/src/api/web_page/execute.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ export function registerExecuteRoutes({ config, server, getLocalBrowserServer }:
119119
const workerLog = logger.child({ provider: 'worker' });
120120
const workerData: WorkerData = {
121121
browserConfig: remoteBrowserConfig,
122+
extractorSandboxConfig: config.extractorSandbox,
122123
extractor: request.body.extractor,
123124
extractorParams: request.body.extractorParams,
124125
tags: request.body.tags,

components/retrack-web-scraper/src/api/web_page/extractor_module_hooks.ts

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,31 @@
1-
import type { ResolveFnOutput, ResolveHookContext } from 'module';
1+
import type { ResolveFnOutput, ResolveHookContext, InitializeHook } from 'module';
2+
import type { ExtractorSandboxConfig } from '../../config.js';
23

34
// This set contains the modules that are allowed to be imported by extractor scripts.
4-
const EXTRACTOR_MODULE_ALLOWLIST = new Set(['node:util', 'stream', 'stream/promises']);
5+
const EXTRACTOR_MODULE_ALLOWLIST = new Set([
6+
'node:stream',
7+
'node:stream/promises',
8+
'stream',
9+
'stream/promises',
10+
11+
'node:timers',
12+
'node:timers/promises',
13+
'timers',
14+
'timers/promises',
15+
16+
'node:util',
17+
'util',
18+
]);
19+
20+
// The initialize hook provides a way to define a custom function that runs in
21+
// the hooks thread when the hooks module is initialized. Initialization happens
22+
// when the hooks module is registered via `register`.
23+
export const initialize: InitializeHook<ExtractorSandboxConfig> = async ({ extraAllowedModules }) => {
24+
// Add extra allowed modules to the allowlist.
25+
for (const module of extraAllowedModules) {
26+
EXTRACTOR_MODULE_ALLOWLIST.add(module);
27+
}
28+
};
529

630
// This hook is called whenever a module is resolved, allowing you to intercept the resolution process and prevent
731
// extractor scripts from importing modules. This is useful for preventing extractor scripts from accessing sensitive

components/retrack-web-scraper/src/api/web_page/worker.ts

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ import { register } from 'node:module';
33
import { pathToFileURL } from 'node:url';
44
import { resolve } from 'node:path';
55
import type { Browser, Page } from 'playwright-core';
6+
import type { ExtractorSandboxConfig } from '../../config.js';
67

78
import { Diagnostics } from '../diagnostics.js';
89
import type { WorkerData } from './constants.js';
@@ -17,6 +18,7 @@ if (!parentPort) {
1718
// Load the extractor script as an ES module.
1819
const {
1920
browserConfig,
21+
extractorSandboxConfig,
2022
extractor,
2123
extractorParams,
2224
tags,
@@ -50,7 +52,9 @@ for (const Class of [
5052

5153
// SECURITY: We load custom hooks to prevent extractor scripts from importing sensitive native and playwright modules.
5254
// See https://github.com/nodejs/node/issues/47747 for more details.
53-
register(resolve(import.meta.dirname, './extractor_module_hooks.js'), pathToFileURL('./'));
55+
register<ExtractorSandboxConfig>(resolve(import.meta.dirname, './extractor_module_hooks.js'), pathToFileURL('./'), {
56+
data: extractorSandboxConfig,
57+
});
5458
const extractorModule = (await import(`data:text/javascript,${encodeURIComponent(extractor)}`)) as {
5559
execute: (page: Page, context: { tags: string[]; params?: unknown; previousContent?: unknown }) => Promise<unknown>;
5660
};

components/retrack-web-scraper/src/config.ts

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ export interface Config {
88
isDev: boolean;
99
logLevel: string;
1010
browser: { screenshotsPath?: string; chromium?: BrowserConfig; firefox?: BrowserConfig };
11+
extractorSandbox: ExtractorSandboxConfig;
1112
userAgent?: string;
1213
server: { bodyLimit: number };
1314
}
@@ -39,6 +40,13 @@ export interface LocalBrowserConfig {
3940
chromiumSandbox: boolean;
4041
}
4142

43+
// Represents the configuration for the extractor sandbox.
44+
export interface ExtractorSandboxConfig {
45+
// The list of Node.js modules that are allowed to be imported in the
46+
// extractor scripts in addition to those that are allowed by default.
47+
extraAllowedModules: string[];
48+
}
49+
4250
export function configure(): Config {
4351
return {
4452
version: pkg.version,
@@ -76,5 +84,10 @@ export function configure(): Config {
7684
bodyLimit: +(process.env.RETRACK_WEB_SCRAPER_SERVER_BODY_LIMIT ?? 0) || 5 * 1024 * 1024,
7785
},
7886
userAgent: process.env.RETRACK_WEB_SCRAPER_USER_AGENT,
87+
extractorSandbox: {
88+
extraAllowedModules: process.env.RETRACK_WEB_SCRAPER_EXTRACTOR_SANDBOX_EXTRA_ALLOWED_MODULES
89+
? process.env.RETRACK_WEB_SCRAPER_EXTRACTOR_SANDBOX_EXTRA_ALLOWED_MODULES.split(',').map((m) => m.trim())
90+
: [],
91+
},
7992
};
8093
}

0 commit comments

Comments
 (0)