Commit 49380e4
Squashed commit of the following:
commit 3eec08f
Author: Adam Thompson <2414030+TheSonOfThomp@users.noreply.github.com>
Date: Fri May 9 10:46:31 2025 -0400

Ingest with recursive crawler (#2849)

* feat(ui): initialize chatbot UI with React and Vite (#2841)
* feat(ui): initialize chatbot UI with React and Vite
  - Added package.json for UI package with scripts and dependencies.
  - Created App.css for styling the main application layout.
  - Implemented App.tsx as the main component integrating the MongoDB Chatbot UI.
  - Added a React SVG asset for branding.
  - Created index.css for global styles and theming.
  - Set up main.tsx as the entry point for the React application.
  - Added vite-env.d.ts for Vite type definitions.
  - Configured TypeScript with tsconfig.json and tsconfig.node.json for the UI package.
  - Created vite.config.ts for Vite configuration with React plugin.
* fix: update license to Apache-2.0 and refactor import statements for consistency
* feat(ingest): add initial configuration and data sources for chatbot ingestion
* update configs
* feat(ingest): implement data sources for LeafyGreen UI and MongoDB Chatbot Framework
* chore: update pnpm workspace configuration to include 'apps/*' directory
* feat(ingest): add data sources for LeafyGreen UI and MongoDB Chatbot Framework, and refactor Azure OpenAI embedder
* feat(ingest): add MongoDB Design website data source and integrate into ingest configuration
* Delete package-lock.json
* reset ui
* feat(crawler): export LoadedPageContents and ProcessSingleUrl interfaces
* feat(crawler): refactor crawling logic to use processLangchainDocument and update allowedDomains format
* feat(crawler): update recursive crawl logic to process documents with processLangchainDocument
* feat(crawler): update package.json exports and refactor lodash imports
* feat(crawler): implement createWebSourceConstructor and update ingest configuration
* Update ingest.config.ts
* feat(crawler): replace createWebSourceConstructor with webSourceConstructor and update data sources
* fix(crawler): handle invalid URLs in newURL and recursive crawl logic
* Resolve tsconfig.json

---------

Co-authored-by: Terrence Keane <terrence.keane@mongodb.com>
Parent: cd04b5c

1 file changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
/* eslint-disable no-console */
import { recursiveCrawlFromBaseURL } from '@lg-tools/crawler';
import { Page } from 'mongodb-rag-core';
import { type DataSource } from 'mongodb-rag-core/dataSources';

export interface WebSourceConstructorOptions {
  /** Maximum link depth to follow from the base URL. Defaults to 3. */
  maxDepth?: number;
  /** Log crawl progress to the console. Defaults to false. */
  verbose?: boolean;
}

/**
 * Creates a `DataSource` that recursively crawls
 * the given base URL and collects every page it visits.
 */
export async function webSourceConstructor(
  source: string,
  options?: WebSourceConstructorOptions,
): Promise<DataSource> {
  // Fall back to the defaults when options (or individual fields) are omitted.
  const { maxDepth = 3, verbose = false } = options ?? {};

  return {
    name: source,
    fetchPages: async () => {
      verbose && console.log(`🐶 Fetching source ${source}`);
      const pages: Array<Page> = [];

      await recursiveCrawlFromBaseURL(
        // Called once per crawled page; accumulate each page for ingestion.
        ({ document, title, href }) => {
          verbose && console.log(`🪲 Crawled page ${title} - ${href}`);
          pages.push({
            url: href,
            title,
            body: document.pageContent,
            format: 'txt',
            sourceName: source,
            metadata: {
              id: document.id,
              ...document.metadata,
            },
          });
        },
        {
          baseUrl: source,
          maxDepth,
          verbose,
          enableRecursion: true,
        },
      );

      return pages;
    },
  };
}
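For context, here is a minimal sketch of how this constructor might be wired into an ingest configuration. The `dataSources` function shape, the relative import path, and the URL are illustrative assumptions based on the data sources named in the commit message, not code from this commit.

// Hypothetical ingest.config.ts sketch — shape, path, and URL are assumptions.
import { type DataSource } from 'mongodb-rag-core/dataSources';
import { webSourceConstructor } from './webSourceConstructor';

// Each entry becomes a named DataSource whose fetchPages()
// recursively crawls the site up to `maxDepth` links deep.
export async function dataSources(): Promise<Array<DataSource>> {
  return Promise.all([
    webSourceConstructor('https://www.mongodb.design', {
      maxDepth: 3,
      verbose: true,
    }),
  ]);
}

Note the design choice: `recursiveCrawlFromBaseURL` reports pages through a callback, and `fetchPages` buffers them into an array so the ingest pipeline receives the full crawl result at once.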
