Skip to content

Commit

Permalink
Support loading custom behaviors from git repo (#717)
Browse files Browse the repository at this point in the history
Fixes #712 
- Also expands the existing documentation about behaviors and adds a test.
- Uses query arg for 'branch' and 'path' to specify git branch and subpath in repo, respectively.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
  • Loading branch information
tw4l and ikreymer authored Nov 14, 2024
1 parent ea05307 commit 60c84b3
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 7 deletions.
37 changes: 33 additions & 4 deletions docs/docs/user-guide/behaviors.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,41 @@ To disable behaviors for a crawl, use `--behaviors ""`.

## Additional Custom Behaviors

Custom behaviors can be mounted into the crawler and loaded from there. For example:
Custom behaviors can be mounted into the crawler and run from there, or downloaded from a URL.

Each behavior should contain a single class that implements the behavior interface. See [the behaviors tutorial](https://github.com/webrecorder/browsertrix-behaviors/blob/main/docs/TUTORIAL.md) for more info on how to write behaviors.

The first behavior which returns true for `isMatch()` will be run on a given page.

The repeatable `--customBehaviors` flag can accept:

- A path to a directory of behavior files
- A path to a single behavior file
- A URL for a single behavior file to download
- A URL for a git repository of the form `git+https://git.example.com/repo.git`, with optional query parameters `branch` (to specify a particular branch to use) and `path` (to specify a relative path to a directory within the git repository where the custom behaviors are located)

### Examples

#### Local filepath (directory)

```sh
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --customBehaviors /custom-behaviors/
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors /custom-behaviors/
```

This will load all the custom behaviors stored in the `tests/custom-behaviors` directory. The first behavior which returns true for `isMatch()` will be run on a given page.
#### Local filepath (file)

Each behavior should contain a single class that implements the behavior interface. See [the behaviors tutorial](https://github.com/webrecorder/browsertrix-behaviors/blob/main/docs/TUTORIAL.md) for more info on how to write behaviors.
```sh
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors /custom-behaviors/custom.js
```

#### URL

```sh
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors https://example.com/custom-behavior-1 --customBehaviors https://example.org/custom-behavior-2
```

#### Git repository

```sh
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --customBehaviors "git+https://git.example.com/custom-behaviors?branch=dev&path=path/to/behaviors"
```
6 changes: 4 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -564,8 +564,10 @@ class ArgParser {

customBehaviors: {
describe:
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
" to a directory of behavior files",
"Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +
" of behaviors, URL to Git repo of behaviors (prefixed with git+, optionally specify branch and" +
" relative path to a directory within repo as branch and path query parameters, e.g." +
' --customBehaviors "git+https://git.example.com/repo.git?branch=dev&path=some/dir"',
type: "array",
default: [],
},
Expand Down
47 changes: 46 additions & 1 deletion src/util/file_reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@ import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";
import util from "util";
import {
  exec as execCallback,
  execFile as execFileCallback,
} from "child_process";

import { logger } from "./logger.js";

const exec = util.promisify(execCallback);

const MAX_DEPTH = 2;

// Add .ts to allowed extensions when we can support it
Expand All @@ -23,7 +27,10 @@ export async function collectCustomBehaviors(
const collectedSources: FileSources = [];

for (const fileSource of sources) {
if (fileSource.startsWith("http")) {
if (fileSource.startsWith("git+")) {
const newSources = await collectGitBehaviors(fileSource);
collectedSources.push(...newSources);
} else if (fileSource.startsWith("http")) {
const newSources = await collectOnlineBehavior(fileSource);
collectedSources.push(...newSources);
} else {
Expand All @@ -35,6 +42,44 @@ export async function collectCustomBehaviors(
return collectedSources;
}

// Promisified execFile: runs a binary with an argument array, without a shell.
const execFile = util.promisify(execFileCallback);

/**
 * Clone a git repository of custom behaviors into a fresh directory under
 * /tmp and collect the behavior files found in it.
 *
 * @param gitUrl Repository URL prefixed with `git+`, e.g.
 *   `git+https://git.example.com/repo.git?branch=dev&path=some/dir`.
 *   The optional `branch` query parameter selects the branch to clone and the
 *   optional `path` query parameter selects a directory inside the repository
 *   to collect behaviors from (defaults to the repository root).
 * @returns The sources collected from the cloned repository, or an empty
 *   list if the URL is invalid, `path` points outside the repository, or the
 *   clone / collection fails (the error is logged, not thrown).
 */
async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
  // Strip only the leading "git+" marker; splitting on every "git+" would
  // mangle URLs that happen to contain that substring later on.
  const url = gitUrl.startsWith("git+") ? gitUrl.slice("git+".length) : gitUrl;
  // URL without the query string: this is what git clones and what we log.
  const urlStripped = url.split("?")[0];

  const tmpDir = `/tmp/behaviors-repo-${crypto.randomBytes(4).toString("hex")}`;

  try {
    // Parsed inside the try block so a malformed URL is reported through the
    // same error log as a failed clone.
    const params = new URL(url).searchParams;
    const branch = params.get("branch") || "";
    const relPath = params.get("path") || "";

    // path.join normalizes ".." segments, so a prefix check is enough to
    // detect a `path` that would escape the cloned repository.
    const pathToCollect = path.join(tmpDir, relPath);
    if (
      pathToCollect !== tmpDir &&
      !pathToCollect.startsWith(tmpDir + path.sep)
    ) {
      throw new Error(
        `path "${relPath}" points outside of the cloned repository`,
      );
    }

    // Pass arguments as an array (no shell involved) so that characters such
    // as ";", "$" or spaces in the user-supplied branch / URL cannot be
    // interpreted as shell syntax.
    const cloneArgs = ["clone"];
    if (branch) {
      cloneArgs.push("-b", branch, "--single-branch");
    }
    // "--" ends option parsing: the remaining values are always positional.
    cloneArgs.push("--", urlStripped, tmpDir);

    await execFile("git", cloneArgs);
    logger.info(
      "Custom behavior files downloaded from git repo",
      { url: urlStripped },
      "behavior",
    );
    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.error(
      "Error downloading custom behaviors from Git repo",
      { url: urlStripped, error: e },
      "behavior",
    );
  }
  return [];
}

async function collectOnlineBehavior(url: string): Promise<FileSources> {
const filename = crypto.randomBytes(4).toString("hex") + ".js";
const behaviorFilepath = `/app/behaviors/${filename}`;
Expand Down
35 changes: 35 additions & 0 deletions tests/custom-behavior.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,41 @@ test("test mixed custom behavior sources", async () => {
).toBe(true);
});

test("test custom behaviors from git repo", async () => {
  // Crawl three pages with custom behaviors loaded from the
  // tests/custom-behaviors directory on the main branch of the
  // browsertrix-crawler GitHub repository, and capture the crawler output.
  const output = child_process
    .execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
    )
    .toString();

  // True when the given log fragment occurs in the output after position 0.
  const wasLogged = (fragment) => output.indexOf(fragment) > 0;

  // custom behavior ran for specs.webrecorder.net
  expect(
    wasLogged(
      '{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
    ),
  ).toBe(true);

  // but not for example.org
  expect(
    wasLogged(
      '{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
    ),
  ).toBe(false);

  // for example.org the autoscroll-skipped message is logged instead
  expect(
    wasLogged(
      '{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
    ),
  ).toBe(true);

  // another custom behavior ran for old.webrecorder.net
  expect(
    wasLogged(
      '{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
    ),
  ).toBe(true);
});

test("test invalid behavior exit", async () => {
let status = 0;

Expand Down

0 comments on commit 60c84b3

Please sign in to comment.