Skip to content

Commit

Permalink
Support loading custom behaviors from URLs and/or filepaths (#707)
Browse files Browse the repository at this point in the history
Fixes #368 

The `--customBehaviors` flag is now an array, making it repeatable. Existing
CLI invocations that pass the flag once should keep working, but YAML
configs that use custom behaviors may require changes.

Custom behaviors can be loaded from URLs, local filepaths, and paths to
local directories, including any combination thereof.

New tests are added to ensure loading behaviors from URLs as well as a
mixed combination of URL and filepath works as expected.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
  • Loading branch information
tw4l and ikreymer authored Nov 5, 2024
1 parent e5bab8e commit 2a9b152
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 40 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile

RUN mkdir -p /app/behaviors

WORKDIR /crawls

# enable to test custom behaviors build (from browsertrix-behaviors)
Expand Down
8 changes: 5 additions & 3 deletions docs/docs/user-guide/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,11 @@ Options:
ailOnFailedSeed may result in crawl
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
[string]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direct
ory of behavior files.
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
ess (for debugging) [boolean]
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.3.4",
"version": "1.4.0-beta.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
14 changes: 7 additions & 7 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import {
runWorkers,
} from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";

import { Browser } from "./util/browser.js";

Expand Down Expand Up @@ -511,7 +511,7 @@ export class Crawler {
}

if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(
this.customBehaviors = await this.loadCustomBehaviors(
this.params.customBehaviors,
);
}
Expand Down Expand Up @@ -801,24 +801,24 @@ self.__bx_behaviors.selectMainBehavior();
});
}

loadCustomBehaviors(filename: string) {
async loadCustomBehaviors(sources: string[]) {
let str = "";

for (const { contents } of collectAllFileSources(filename, ".js")) {
for (const { contents } of await collectCustomBehaviors(sources)) {
str += `self.__bx_behaviors.load(${contents});\n`;
}

return str;
}

async checkBehaviorScripts(cdp: CDPSession) {
const filename = this.params.customBehaviors;
const sources = this.params.customBehaviors;

if (!filename) {
if (!sources) {
return;
}

for (const { path, contents } of collectAllFileSources(filename, ".js")) {
for (const { path, contents } of await collectCustomBehaviors(sources)) {
await this.browser.checkScript(cdp, path, contents);
}
}
Expand Down
6 changes: 4 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -551,8 +551,10 @@ class ArgParser {

customBehaviors: {
describe:
"injects a custom behavior file or set of behavior files in a directory",
type: "string",
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
" to a directory of behavior files",
type: "array",
default: [],
},

debugAccessRedis: {
Expand Down
104 changes: 82 additions & 22 deletions src/util/file_reader.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,83 @@
import fs from "fs";
import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";

import { logger } from "./logger.js";

const MAX_DEPTH = 2;

export function collectAllFileSources(
// Add .ts to allowed extensions when we can support it
const ALLOWED_EXTS = [".js"];

/**
 * A single collected behavior script: where it came from and its text.
 */
export type FileSource = {
// Path of the file the script was read from.
path: string;
// Script source text; callers inject this into the page.
contents: string;
};

/** A list of collected behavior scripts. */
export type FileSources = FileSource[];

/**
 * Collects custom behavior scripts from every entry in `sources`.
 *
 * Each entry is either an http(s) URL, which is downloaded, or a local
 * path to a file or directory, which is read from disk. Entries are
 * processed sequentially and results are returned in the order the
 * sources were given.
 *
 * @param sources - URLs and/or local paths supplied via `--customBehaviors`.
 * @returns All behavior scripts found across the given sources.
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    // Require a full scheme so that local paths which merely begin with
    // "http" (e.g. "http-behaviors/" or "httpdocs/custom.js") are still
    // read from disk instead of being passed to fetch().
    const isUrl = /^https?:\/\//i.test(fileSource);

    const newSources = isUrl
      ? await collectOnlineBehavior(fileSource)
      : await collectLocalPathBehaviors(fileSource);

    collectedSources.push(...newSources);
  }

  return collectedSources;
}

/**
 * Downloads a custom behavior script from `url`, saves it under
 * `/app/behaviors/` with a random `.js` filename, and collects it through
 * the same path as a local behavior file.
 *
 * Failures are logged rather than thrown, so one bad URL does not stop the
 * remaining sources from being collected.
 *
 * @param url - http(s) URL of the behavior script.
 * @returns The collected behavior, or an empty list if the download failed.
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url);

    // fetch() only rejects on network-level failures; an HTTP error status
    // still resolves. Without this check the body of a 404/500 error page
    // would be written to disk and injected as if it were a behavior.
    if (!res.ok) {
      logger.error(
        "Error downloading custom behavior from URL",
        { url, status: res.status },
        "behavior",
      );
      return [];
    }

    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }
  return [];
}

async function collectLocalPathBehaviors(
fileOrDir: string,
ext?: string,
depth = 0,
): { path: string; contents: string }[] {
): Promise<FileSources> {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
logger.warn(
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
{},
"behavior",
);
return [];
}

const stat = fs.statSync(resolvedPath);
const stat = await fsp.stat(resolvedPath);

if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath);
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
Expand All @@ -30,24 +86,28 @@ export function collectAllFileSources(
];
}

if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath);
return files.reduce(
(acc: { path: string; contents: string }[], next: string) => {
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
},
[],
const behaviors: FileSources = [];

const isDir = stat.isDirectory();

if (!isDir && depth === 0) {
logger.warn(
"The provided path is not a .js file or directory",
{ path: resolvedPath },
"behavior",
);
}

if (depth === 0) {
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
if (isDir) {
const files = await fsp.readdir(resolvedPath);
for (const file of files) {
const filePath = path.join(resolvedPath, file);
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
behaviors.push(...newBehaviors);
}
}

return [];
return behaviors;
}

export async function getInfoString() {
Expand Down
44 changes: 40 additions & 4 deletions tests/custom-behavior.test.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import child_process from "child_process";

test("test custom behaviors", async () => {
test("test custom behaviors from local filepath", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
);

const log = res.toString();

// custom behavior ran for example.com
// custom behavior ran for specs.webrecorder.net
expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);

Expand All @@ -35,6 +35,42 @@ test("test custom behaviors", async () => {
).toBe(true);
});

test("test custom behavior from URL", async () => {
  // Single-page crawl whose only custom behavior is supplied as a URL.
  const command =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page";

  const output = child_process.execSync(command).toString();

  // The crawler logs this message after saving the remote script locally.
  const downloadLogIndex = output.indexOf("Custom behavior file downloaded");
  expect(downloadLogIndex > 0).toBe(true);

  // The downloaded behavior is expected to have run on the crawled page.
  const behaviorLogIndex = output.indexOf(
    '{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
  );
  expect(behaviorLogIndex > 0).toBe(true);
});

test("test mixed custom behavior sources", async () => {
  // Crawl two pages with one behavior loaded from a URL (custom-2.js) and
  // one loaded from a mounted local file (custom.js).
  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");

  const log = res.toString();

  // the behavior given as a URL was downloaded
  expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);

  // the local file behavior (custom.js) ran on specs.webrecorder.net
  expect(
    log.indexOf(
      '{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
    ) > 0,
  ).toBe(true);

  // the URL behavior (custom-2.js) ran on old.webrecorder.net
  expect(
    log.indexOf(
      '{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
    ) > 0,
  ).toBe(true);
});

test("test invalid behavior exit", async () => {
let status = 0;

Expand Down
2 changes: 1 addition & 1 deletion tests/custom-behaviors/custom.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class TestBehavior {
}

static isMatch() {
return window.location.origin === "https://example.com";
return window.location.origin === "https://specs.webrecorder.net";
}

async *run(ctx) {
Expand Down

0 comments on commit 2a9b152

Please sign in to comment.