Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support loading custom behaviors from URLs and/or filepaths #707

Merged
merged 5 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile

RUN mkdir -p /app/behaviors

WORKDIR /crawls

# enable to test custom behaviors build (from browsertrix-behaviors)
Expand Down
8 changes: 5 additions & 3 deletions docs/docs/user-guide/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,11 @@ Options:
ailOnFailedSeed may result in crawl
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
[string]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direct
ory of behavior files.
tw4l marked this conversation as resolved.
Show resolved Hide resolved
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
ess (for debugging) [boolean]
Expand Down
14 changes: 7 additions & 7 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import {
runWorkers,
} from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";

import { Browser } from "./util/browser.js";

Expand Down Expand Up @@ -510,7 +510,7 @@ export class Crawler {
}

if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(
this.customBehaviors = await this.loadCustomBehaviors(
this.params.customBehaviors,
);
}
Expand Down Expand Up @@ -800,24 +800,24 @@ self.__bx_behaviors.selectMainBehavior();
});
}

loadCustomBehaviors(filename: string) {
async loadCustomBehaviors(sources: string[]) {
let str = "";

for (const { contents } of collectAllFileSources(filename, ".js")) {
for (const { contents } of await collectCustomBehaviors(sources)) {
str += `self.__bx_behaviors.load(${contents});\n`;
}

return str;
}

async checkBehaviorScripts(cdp: CDPSession) {
const filename = this.params.customBehaviors;
const sources = this.params.customBehaviors;

if (!filename) {
if (!sources) {
return;
}

for (const { path, contents } of collectAllFileSources(filename, ".js")) {
for (const { path, contents } of await collectCustomBehaviors(sources)) {
await this.browser.checkScript(cdp, path, contents);
}
}
Expand Down
6 changes: 4 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -551,8 +551,10 @@ class ArgParser {

customBehaviors: {
describe:
"injects a custom behavior file or set of behavior files in a directory",
type: "string",
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths " +
"to a directory of behavior files.",
tw4l marked this conversation as resolved.
Show resolved Hide resolved
type: "array",
default: [],
},

debugAccessRedis: {
Expand Down
104 changes: 82 additions & 22 deletions src/util/file_reader.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,83 @@
import fs from "fs";
import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";

import { logger } from "./logger.js";

const MAX_DEPTH = 2;

export function collectAllFileSources(
// Add .ts to allowed extensions when we can support it
const ALLOWED_EXTS = [".js"];

export type FileSource = {
path: string;
contents: string;
};

export type FileSources = FileSource[];

/**
 * Collects custom behavior scripts from a mixed list of sources.
 *
 * Each entry is handled in order: entries that start with `http://` or
 * `https://` are downloaded via `collectOnlineBehavior`, everything else is
 * treated as a local file or directory path and handed to
 * `collectLocalPathBehaviors`. Results are appended in source order.
 *
 * @param sources - URLs and/or local file or directory paths.
 * @returns All behavior files gathered from every source.
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    // Require a full scheme prefix: a bare "http" check would misroute local
    // paths such as "http-behaviors/" or "httpHelper.js" to the downloader.
    const isUrl =
      fileSource.startsWith("http://") || fileSource.startsWith("https://");

    const newSources = isUrl
      ? await collectOnlineBehavior(fileSource)
      : await collectLocalPathBehaviors(fileSource);

    collectedSources.push(...newSources);
  }

  return collectedSources;
}

/**
 * Downloads a single behavior script from a URL and loads it as a local file.
 *
 * The response body is written to `/app/behaviors/` under a random
 * 8-hex-character name with a `.js` extension, then read back through
 * `collectLocalPathBehaviors`. The target directory must already exist,
 * since `writeFile` does not create parent directories.
 *
 * Failures are logged and produce an empty result rather than throwing.
 *
 * @param url - Location of the behavior script to download.
 * @returns The downloaded behavior as a file source, or `[]` on failure.
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url);

    // fetch() only rejects on network errors; without this check a 404/500
    // error page would be saved and later injected as a behavior script.
    if (!res.ok) {
      logger.error(
        "Error downloading custom behavior from URL",
        { url, status: res.status },
        "behavior",
      );
      return [];
    }

    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }
  return [];
}

async function collectLocalPathBehaviors(
fileOrDir: string,
ext?: string,
depth = 0,
): { path: string; contents: string }[] {
): Promise<FileSources> {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
logger.warn(
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
{},
"behavior",
);
return [];
}

const stat = fs.statSync(resolvedPath);
const stat = await fsp.stat(resolvedPath);

if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath);
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
Expand All @@ -30,24 +86,28 @@ export function collectAllFileSources(
];
}

if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath);
return files.reduce(
(acc: { path: string; contents: string }[], next: string) => {
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
},
[],
const behaviors: FileSources = [];

const isDir = stat.isDirectory();

if (!isDir && depth === 0) {
logger.warn(
"The provided path is not a .js file or directory",
{ path: resolvedPath },
"behavior",
);
}

if (depth === 0) {
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
if (isDir) {
const files = await fsp.readdir(resolvedPath);
for (const file of files) {
const filePath = path.join(resolvedPath, file);
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
behaviors.push(...newBehaviors);
}
}

return [];
return behaviors;
}

export async function getInfoString() {
Expand Down
44 changes: 40 additions & 4 deletions tests/custom-behavior.test.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import child_process from "child_process";

test("test custom behaviors", async () => {
test("test custom behaviors from local filepath", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
);

const log = res.toString();

// custom behavior ran for example.com
// custom behavior ran for specs.webrecorder.net
expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);

Expand All @@ -35,6 +35,42 @@ test("test custom behaviors", async () => {
).toBe(true);
});

test("test custom behavior from URL", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page");

const log = res.toString();

expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);

expect(
log.indexOf(
'{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
});

test("test mixed custom behavior sources", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");

const log = res.toString();

// test custom behavior from url ran
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);

expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);

// test custom behavior from local file ran
expect(
log.indexOf(
'{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
});

test("test invalid behavior exit", async () => {
let status = 0;

Expand Down
2 changes: 1 addition & 1 deletion tests/custom-behaviors/custom.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class TestBehavior {
}

static isMatch() {
return window.location.origin === "https://example.com";
return window.location.origin === "https://specs.webrecorder.net";
}

async *run(ctx) {
Expand Down
Loading