
Commit 3eec08f

TheSonOfThomptsck and Terrence Keane authored
Ingest with recursive crawler (#2849)
* feat(ui): initialize chatbot UI with React and Vite (#2841)
* feat(ui): initialize chatbot UI with React and Vite
  - Added package.json for UI package with scripts and dependencies.
  - Created App.css for styling the main application layout.
  - Implemented App.tsx as the main component integrating the MongoDB Chatbot UI.
  - Added a React SVG asset for branding.
  - Created index.css for global styles and theming.
  - Set up main.tsx as the entry point for the React application.
  - Added vite-env.d.ts for Vite type definitions.
  - Configured TypeScript with tsconfig.json and tsconfig.node.json for the UI package.
  - Created vite.config.ts for Vite configuration with React plugin.
* fix: update license to Apache-2.0 and refactor import statements for consistency
* feat(ingest): add initial configuration and data sources for chatbot ingestion
* update configs
* feat(ingest): implement data sources for LeafyGreen UI and MongoDB Chatbot Framework
* chore: update pnpm workspace configuration to include 'apps/*' directory
* feat(ingest): add data sources for LeafyGreen UI and MongoDB Chatbot Framework, and refactor Azure OpenAI embedder
* feat(ingest): add MongoDB Design website data source and integrate into ingest configuration
* Delete package-lock.json
* reset ui
* feat(crawler): export LoadedPageContents and ProcessSingleUrl interfaces
* feat(crawler): refactor crawling logic to use processLangchainDocument and update allowedDomains format
* feat(crawler): update recursive crawl logic to process documents with processLangchainDocument
* feat(crawler): update package.json exports and refactor lodash imports
* feat(crawler): implement createWebSourceConstructor and update ingest configuration
* Update ingest.config.ts
* feat(crawler): replace createWebSourceConstructor with webSourceConstructor and update data sources
* fix(crawler): handle invalid URLs in newURL and recursive crawl logic
* Resolve tsconfig.json

---------

Co-authored-by: Terrence Keane <terrence.keane@mongodb.com>
1 parent 09f5aef · commit 3eec08f

15 files changed: +272 −137 lines changed

apps/chatbot-server/ingest/ingest.config.ts

Lines changed: 14 additions & 4 deletions

@@ -5,10 +5,11 @@ import {
 import { AzureOpenAI } from 'mongodb-rag-core/openai';
 import { Config, makeIngestMetaStore } from 'mongodb-rag-ingest';
 
-import { leafygreenGithubSourceConstructor } from './sources/github-leafygreen-ui';
-import { mongoDbChatbotFrameworkDocsDataSourceConstructor } from './sources/github-mdb-chatbot-framework';
-import { createAzureEmbedderConstructor } from './utils/createAzureEmbedderConstructor';
-import { loadEnvVars } from './utils/loadEnv';
+import { leafygreenGithubSourceConstructor } from './sources/github-leafygreen-ui.js';
+import { mongoDbChatbotFrameworkDocsDataSourceConstructor } from './sources/github-mdb-chatbot-framework.js';
+import { createAzureEmbedderConstructor } from './utils/createAzureEmbedderConstructor.js';
+import { loadEnvVars } from './utils/loadEnv.js';
+import { webSourceConstructor } from './utils/webSourceConstructor.js';
 
 // Load project environment variables
 const {
@@ -49,6 +50,15 @@ export default {
   // Add data sources here
   dataSources: async () => {
     return Promise.all([
+      ...[
+        'https://mongodb.design',
+        'https://react.dev/reference/react',
+        'https://developer.mozilla.org/en-US/docs/Web',
+        'https://css-tricks.com/category/articles',
+        'https://www.nngroup.com/articles',
+        'https://www.w3.org/WAI/standards-guidelines/wcag',
+        'https://atomicdesign.bradfrost.com/table-of-contents',
+      ].map(source => webSourceConstructor(source, {})),
       mongoDbChatbotFrameworkDocsDataSourceConstructor(),
       leafygreenGithubSourceConstructor(),
     ]);
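For orientation, here is a minimal sketch of how the wired-up config could be sanity-checked from the ingest package, assuming the environment variables that `loadEnvVars` reads are set. The script file and its relative import path are hypothetical, not part of this commit.

```ts
// scripts/listDataSources.ts (hypothetical) — load the ingest config and list
// the data sources it now registers: the seven crawled web sources plus the
// chatbot-framework docs and LeafyGreen GitHub sources.
import config from '../ingest.config.js';

const dataSources = await config.dataSources();

for (const source of dataSources) {
  console.log(source.name);
}
```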

apps/chatbot-server/ingest/tsconfig.json

Lines changed: 5 additions & 0 deletions

@@ -16,4 +16,9 @@
     "./ingest.config.ts"
   ],
   "include": ["./**/*.ts"],
+  "references": [
+    {
+      "path": "../../../tools/crawler"
+    },
+  ]
 }
apps/chatbot-server/ingest/utils/webSourceConstructor.ts

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+/* eslint-disable no-console */
+import { recursiveCrawlFromBaseURL } from '@lg-tools/crawler';
+import { Page } from 'mongodb-rag-core';
+import { type DataSource } from 'mongodb-rag-core/dataSources';
+
+export interface WebSourceConstructorOptions {
+  maxDepth?: number;
+  verbose?: boolean;
+}
+
+/**
+ * Returns a constructor function
+ * that creates a web source
+ */
+export async function webSourceConstructor(
+  source: string,
+  options?: WebSourceConstructorOptions,
+): Promise<DataSource> {
+  const { maxDepth = 3, verbose = false } = {
+    maxDepth: 3,
+    verbose: false,
+    ...options,
+  };
+  return {
+    name: source,
+    fetchPages: async () => {
+      verbose && console.log(`🐶 Fetching source ${source}`);
+      const pages: Array<Page> = [];
+
+      await recursiveCrawlFromBaseURL(
+        ({ document, title, href }) => {
+          verbose && console.log(`🪲 Crawled page ${title} - ${href}`);
+          pages.push({
+            url: href,
+            title,
+            body: document.pageContent,
+            format: 'txt',
+            sourceName: source,
+            metadata: {
+              id: document.id,
+              ...document.metadata,
+            },
+          });
+        },
+        {
+          baseUrl: source,
+          maxDepth,
+          verbose,
+          enableRecursion: true,
+        },
+      );
+
+      return pages;
+    },
+  };
+}
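As a rough illustration of the new helper in isolation (this snippet is not part of the commit; the URL is one of the sources listed in ingest.config.ts, and running it performs a real crawl):

```ts
import { webSourceConstructor } from './utils/webSourceConstructor.js';

// Build a DataSource for a single site and fetch its pages directly,
// bypassing the rest of the ingest pipeline.
const designDocs = await webSourceConstructor('https://mongodb.design', {
  maxDepth: 1,
  verbose: true,
});

const pages = await designDocs.fetchPages();
console.log(`${designDocs.name}: crawled ${pages.length} pages`);
```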

apps/chatbot-server/package.json

Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,7 @@
   "name": "@leafygreen-ui/chatbot-server",
   "version": "0.0.1",
   "description": "",
+  "type": "module",
   "main": "index.js",
   "publishConfig": {
     "access": "restricted"
@@ -15,6 +16,7 @@
   "author": "",
   "license": "Apache-2.0",
   "dependencies": {
+    "@lg-tools/crawler": "workspace:^",
     "@emotion/css": "^11.13.5",
     "dotenv": "^16.5.0",
     "jsdom": "^26.1.0"

pnpm-lock.yaml

Lines changed: 19 additions & 1 deletion
Some generated files are not rendered by default.

tools/crawler/package.json

Lines changed: 7 additions & 1 deletion

@@ -6,6 +6,11 @@
   "main": "./dist/index.js",
   "module": "./dist/esm/index.js",
   "types": "./dist/types/index.d.ts",
+  "exports": {
+    "types": "./dist/types/index.d.ts",
+    "require": "./dist/index.js",
+    "import": "./dist/esm/index.js"
+  },
   "bin": {
     "lg-crawler": "./bin/cli.js"
   },
@@ -42,12 +47,13 @@
     "commander": "^13.1.0",
     "dotenv": "^16.5.0",
     "langchain": "^0.3.24",
-    "lodash": "^4.17.21",
+    "lodash-es": "^4.17.21",
     "mongodb": "^6.16.0",
     "openai": "^4.97.0",
     "ora": "^8.2.0"
   },
   "devDependencies": {
+    "@types/lodash-es": "^4.17.12",
    "@lg-tools/build": "workspace:^",
    "@lg-tools/meta": "workspace:^",
    "tsx": "^4.19.4"

tools/crawler/src/constants.ts

Lines changed: 2 additions & 5 deletions

@@ -42,13 +42,10 @@ export const SOURCES = [
     url: 'https://atomicdesign.bradfrost.com/table-of-contents',
     collection: 'atomic-design',
   },
-] as const;
+];
 
 /**
  * Allow the crawler to follow links to these domains
  * (with restricted depth)
  */
-export const allowedDomains = [
-  'https://www.mongodb.com',
-  'https://github.com',
-] as const;
+export const allowedDomains = ['https://www.mongodb.com', 'https://github.com'];

tools/crawler/src/crawler.ts

Lines changed: 52 additions & 24 deletions

@@ -3,6 +3,7 @@ import chalk from 'chalk';
 
 import { connectToMongoDB } from './utils/connectToMongoDB';
 import { createCollectionNameFromURL } from './utils/createCollectionNameFromURL';
+import { processLangchainDocument } from './utils/processLangchainDocument';
 import { recursiveCrawlFromBaseURL } from './utils/recursiveCrawlFromBase';
 import { SOURCES } from './constants';
 import { CrawlerOptions } from './types';
@@ -36,14 +37,23 @@ export async function crawl(options: CrawlerOptions) {
         `Crawling URL ${url} with collection name ${matchedSource.collection}`,
       ),
     );
-    await recursiveCrawlFromBaseURL({
-      baseUrl: url,
-      collectionName: matchedSource.collection,
-      mongoClient,
-      maxDepth,
-      verbose,
-      dryRun,
-    });
+    await recursiveCrawlFromBaseURL(
+      ({ document, title, href }) => {
+        processLangchainDocument({
+          doc: document,
+          title,
+          href,
+          collectionName: matchedSource.collection,
+          mongoClient,
+          dryRun,
+        });
+      },
+      {
+        baseUrl: url,
+        maxDepth,
+        verbose,
+      },
+    );
   } else {
     const newCollectionName = createCollectionNameFromURL(url);
     verbose &&
@@ -53,14 +63,23 @@ export async function crawl(options: CrawlerOptions) {
        ),
      );
 
-    await recursiveCrawlFromBaseURL({
-      baseUrl: url,
-      collectionName: newCollectionName,
-      mongoClient,
-      maxDepth,
-      verbose,
-      dryRun,
-    });
+    await recursiveCrawlFromBaseURL(
+      ({ document, title, href }) => {
+        processLangchainDocument({
+          doc: document,
+          title,
+          href,
+          collectionName: newCollectionName,
+          mongoClient,
+          dryRun,
+        });
+      },
+      {
+        baseUrl: url,
+        maxDepth,
+        verbose,
+      },
+    );
   }
 }
 // Otherwise crawl all sources
@@ -71,14 +90,23 @@ export async function crawl(options: CrawlerOptions) {
  );
 
  for (const source of SOURCES) {
-    await recursiveCrawlFromBaseURL({
-      baseUrl: source.url,
-      collectionName: source.collection,
-      mongoClient,
-      maxDepth,
-      verbose,
-      dryRun,
-    });
+    await recursiveCrawlFromBaseURL(
+      ({ document, title, href }) => {
+        processLangchainDocument({
+          doc: document,
+          title,
+          href,
+          collectionName: source.collection,
+          mongoClient,
+          dryRun,
+        });
+      },
+      {
+        baseUrl: source.url,
+        maxDepth,
+        verbose,
+      },
+    );
  }
 }
 
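The shape of the refactored API, inferred only from the call sites in this diff: `recursiveCrawlFromBaseURL` now takes a per-page callback first and a crawl-options object second, instead of one combined options object that included the MongoDB client. A hedged standalone sketch (the property types on the callback argument are assumptions; the real ones live in `recursiveCrawlFromBase.ts`):

```ts
import { recursiveCrawlFromBaseURL } from '@lg-tools/crawler';

// Collect crawled pages without writing to MongoDB: the callback receives the
// LangChain document plus the resolved page title and href for every page the
// crawler visits under the base URL.
const visited: Array<{ title: string; href: string }> = [];

await recursiveCrawlFromBaseURL(
  ({ document, title, href }) => {
    visited.push({ title, href });
    console.log(`${title} (${href}): ${document.pageContent.length} chars`);
  },
  {
    baseUrl: 'https://atomicdesign.bradfrost.com/table-of-contents',
    maxDepth: 1,
    verbose: true,
  },
);

console.log(`Visited ${visited.length} pages`);
```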

tools/crawler/src/index.ts

Lines changed: 10 additions & 1 deletion

@@ -1,6 +1,15 @@
 export { allowedDomains, MDB_DB, SOURCES } from './constants';
-export { processSingleUrl } from './utils/processSingleUrl';
 export {
+  type LoadedPageContents,
+  loadPageContents,
+} from './utils/loadPageContents';
+export {
+  processLangchainDocument,
+  type ProcessLangchainDocumentOptions as ProcessSingleUrlOptions,
+  type ProcessLangchainDocumentResult as ProcessSingleUrlResult,
+} from './utils/processLangchainDocument';
+export {
+  type CrawlerCallback,
   recursiveCrawlFromBaseURL,
   type RecursiveCrawlOptions,
 } from './utils/recursiveCrawlFromBase';

tools/crawler/src/lambda.ts

Lines changed: 15 additions & 6 deletions

@@ -1,4 +1,5 @@
 import { connectToMongoDB } from './utils/connectToMongoDB';
+import { processLangchainDocument } from './utils/processLangchainDocument';
 import { recursiveCrawlFromBaseURL } from './utils/recursiveCrawlFromBase';
 import { SOURCES } from './constants';
 
@@ -17,12 +18,20 @@ exports.handler = async (_e: any) => {
 
   try {
     for (const source of SOURCES) {
-      await recursiveCrawlFromBaseURL({
-        baseUrl: source.url,
-        collectionName: source.collection,
-        mongoClient,
-        maxDepth: 3,
-      });
+      await recursiveCrawlFromBaseURL(
+        ({ document, title, href }) =>
+          processLangchainDocument({
+            doc: document,
+            title,
+            href,
+            collectionName: source.collection,
+            mongoClient,
+          }),
+        {
+          baseUrl: source.url,
+          maxDepth: 3,
+        },
+      );
     }
   } catch (error) {
     console.error('Error during crawling:', error);
