
Commit 3eec08f

TheSonOfThomptsck and Terrence Keane authored
Ingest with recursive crawler (#2849)
* feat(ui): initialize chatbot UI with React and Vite (#2841)
* feat(ui): initialize chatbot UI with React and Vite
  - Added package.json for UI package with scripts and dependencies.
  - Created App.css for styling the main application layout.
  - Implemented App.tsx as the main component integrating the MongoDB Chatbot UI.
  - Added a React SVG asset for branding.
  - Created index.css for global styles and theming.
  - Set up main.tsx as the entry point for the React application.
  - Added vite-env.d.ts for Vite type definitions.
  - Configured TypeScript with tsconfig.json and tsconfig.node.json for the UI package.
  - Created vite.config.ts for Vite configuration with React plugin.
* fix: update license to Apache-2.0 and refactor import statements for consistency
* feat(ingest): add initial configuration and data sources for chatbot ingestion
* update configs
* feat(ingest): implement data sources for LeafyGreen UI and MongoDB Chatbot Framework
* chore: update pnpm workspace configuration to include 'apps/*' directory
* feat(ingest): add data sources for LeafyGreen UI and MongoDB Chatbot Framework, and refactor Azure OpenAI embedder
* feat(ingest): add MongoDB Design website data source and integrate into ingest configuration
* Delete package-lock.json
* reset ui
* feat(crawler): export LoadedPageContents and ProcessSingleUrl interfaces
* feat(crawler): refactor crawling logic to use processLangchainDocument and update allowedDomains format
* feat(crawler): update recursive crawl logic to process documents with processLangchainDocument
* feat(crawler): update package.json exports and refactor lodash imports
* feat(crawler): implement createWebSourceConstructor and update ingest configuration
* Update ingest.config.ts
* feat(crawler): replace createWebSourceConstructor with webSourceConstructor and update data sources
* fix(crawler): handle invalid URLs in newURL and recursive crawl logic
* Resolve tsconfig.json

---------

Co-authored-by: Terrence Keane <terrence.keane@mongodb.com>
1 parent 09f5aef · commit 3eec08f

15 files changed: +272 −137 lines changed

apps/chatbot-server/ingest/ingest.config.ts

Lines changed: 14 additions & 4 deletions

@@ -5,10 +5,11 @@ import {
 import { AzureOpenAI } from 'mongodb-rag-core/openai';
 import { Config, makeIngestMetaStore } from 'mongodb-rag-ingest';
 
-import { leafygreenGithubSourceConstructor } from './sources/github-leafygreen-ui';
-import { mongoDbChatbotFrameworkDocsDataSourceConstructor } from './sources/github-mdb-chatbot-framework';
-import { createAzureEmbedderConstructor } from './utils/createAzureEmbedderConstructor';
-import { loadEnvVars } from './utils/loadEnv';
+import { leafygreenGithubSourceConstructor } from './sources/github-leafygreen-ui.js';
+import { mongoDbChatbotFrameworkDocsDataSourceConstructor } from './sources/github-mdb-chatbot-framework.js';
+import { createAzureEmbedderConstructor } from './utils/createAzureEmbedderConstructor.js';
+import { loadEnvVars } from './utils/loadEnv.js';
+import { webSourceConstructor } from './utils/webSourceConstructor.js';
 
 // Load project environment variables
 const {
@@ -49,6 +50,15 @@ export default {
   // Add data sources here
   dataSources: async () => {
     return Promise.all([
+      ...[
+        'https://mongodb.design',
+        'https://react.dev/reference/react',
+        'https://developer.mozilla.org/en-US/docs/Web',
+        'https://css-tricks.com/category/articles',
+        'https://www.nngroup.com/articles',
+        'https://www.w3.org/WAI/standards-guidelines/wcag',
+        'https://atomicdesign.bradfrost.com/table-of-contents',
+      ].map(source => webSourceConstructor(source, {})),
       mongoDbChatbotFrameworkDocsDataSourceConstructor(),
       leafygreenGithubSourceConstructor(),
     ]);
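For orientation, here is a minimal sketch of how the wired-up config could be sanity-checked from the ingest package, assuming the environment variables that `loadEnvVars` reads are set. The script file and its relative import path are hypothetical, not part of this commit.

```ts
// scripts/listDataSources.ts (hypothetical) — load the ingest config and list
// the data sources it now registers: the seven crawled web sources plus the
// chatbot-framework docs and LeafyGreen GitHub sources.
import config from '../ingest.config.js';

const dataSources = await config.dataSources();

for (const source of dataSources) {
  console.log(source.name);
}
```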

apps/chatbot-server/ingest/tsconfig.json

Lines changed: 5 additions & 0 deletions

@@ -16,4 +16,9 @@
     "./ingest.config.ts"
   ],
   "include": ["./**/*.ts"],
+  "references": [
+    {
+      "path": "../../../tools/crawler"
+    },
+  ]
 }
apps/chatbot-server/ingest/utils/webSourceConstructor.ts

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+/* eslint-disable no-console */
+import { recursiveCrawlFromBaseURL } from '@lg-tools/crawler';
+import { Page } from 'mongodb-rag-core';
+import { type DataSource } from 'mongodb-rag-core/dataSources';
+
+export interface WebSourceConstructorOptions {
+  maxDepth?: number;
+  verbose?: boolean;
+}
+
+/**
+ * Returns a constructor function
+ * that creates a web source
+ */
+export async function webSourceConstructor(
+  source: string,
+  options?: WebSourceConstructorOptions,
+): Promise<DataSource> {
+  const { maxDepth = 3, verbose = false } = {
+    maxDepth: 3,
+    verbose: false,
+    ...options,
+  };
+  return {
+    name: source,
+    fetchPages: async () => {
+      verbose && console.log(`🐶 Fetching source ${source}`);
+      const pages: Array<Page> = [];
+
+      await recursiveCrawlFromBaseURL(
+        ({ document, title, href }) => {
+          verbose && console.log(`🪲 Crawled page ${title} - ${href}`);
+          pages.push({
+            url: href,
+            title,
+            body: document.pageContent,
+            format: 'txt',
+            sourceName: source,
+            metadata: {
+              id: document.id,
+              ...document.metadata,
+            },
+          });
+        },
+        {
+          baseUrl: source,
+          maxDepth,
+          verbose,
+          enableRecursion: true,
+        },
+      );
+
+      return pages;
+    },
+  };
+}
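As a rough illustration of the new helper in isolation (this snippet is not part of the commit; the URL is one of the sources listed in ingest.config.ts, and running it performs a real crawl):

```ts
import { webSourceConstructor } from './utils/webSourceConstructor.js';

// Build a DataSource for a single site and fetch its pages directly,
// bypassing the rest of the ingest pipeline.
const designDocs = await webSourceConstructor('https://mongodb.design', {
  maxDepth: 1,
  verbose: true,
});

const pages = await designDocs.fetchPages();
console.log(`${designDocs.name}: crawled ${pages.length} pages`);
```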

apps/chatbot-server/package.json

Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,7 @@
   "name": "@leafygreen-ui/chatbot-server",
   "version": "0.0.1",
   "description": "",
+  "type": "module",
   "main": "index.js",
   "publishConfig": {
     "access": "restricted"
@@ -15,6 +16,7 @@
   "author": "",
   "license": "Apache-2.0",
   "dependencies": {
+    "@lg-tools/crawler": "workspace:^",
     "@emotion/css": "^11.13.5",
     "dotenv": "^16.5.0",
     "jsdom": "^26.1.0"

pnpm-lock.yaml

Lines changed: 19 additions & 1 deletion
Some generated files are not rendered by default.

tools/crawler/package.json

Lines changed: 7 additions & 1 deletion

@@ -6,6 +6,11 @@
   "main": "./dist/index.js",
   "module": "./dist/esm/index.js",
   "types": "./dist/types/index.d.ts",
+  "exports": {
+    "types": "./dist/types/index.d.ts",
+    "require": "./dist/index.js",
+    "import": "./dist/esm/index.js"
+  },
   "bin": {
     "lg-crawler": "./bin/cli.js"
   },
@@ -42,12 +47,13 @@
     "commander": "^13.1.0",
     "dotenv": "^16.5.0",
     "langchain": "^0.3.24",
-    "lodash": "^4.17.21",
+    "lodash-es": "^4.17.21",
     "mongodb": "^6.16.0",
     "openai": "^4.97.0",
     "ora": "^8.2.0"
   },
   "devDependencies": {
+    "@types/lodash-es": "^4.17.12",
    "@lg-tools/build": "workspace:^",
    "@lg-tools/meta": "workspace:^",
    "tsx": "^4.19.4"

tools/crawler/src/constants.ts

Lines changed: 2 additions & 5 deletions

@@ -42,13 +42,10 @@ export const SOURCES = [
     url: 'https://atomicdesign.bradfrost.com/table-of-contents',
     collection: 'atomic-design',
   },
-] as const;
+];
 
 /**
  * Allow the crawler to follow links to these domains
  * (with restricted depth)
  */
-export const allowedDomains = [
-  'https://www.mongodb.com',
-  'https://github.com',
-] as const;
+export const allowedDomains = ['https://www.mongodb.com', 'https://github.com'];

tools/crawler/src/crawler.ts

Lines changed: 52 additions & 24 deletions

@@ -3,6 +3,7 @@ import chalk from 'chalk';
 
 import { connectToMongoDB } from './utils/connectToMongoDB';
 import { createCollectionNameFromURL } from './utils/createCollectionNameFromURL';
+import { processLangchainDocument } from './utils/processLangchainDocument';
 import { recursiveCrawlFromBaseURL } from './utils/recursiveCrawlFromBase';
 import { SOURCES } from './constants';
 import { CrawlerOptions } from './types';
@@ -36,14 +37,23 @@ export async function crawl(options: CrawlerOptions) {
         `Crawling URL ${url} with collection name ${matchedSource.collection}`,
       ),
     );
-    await recursiveCrawlFromBaseURL({
-      baseUrl: url,
-      collectionName: matchedSource.collection,
-      mongoClient,
-      maxDepth,
-      verbose,
-      dryRun,
-    });
+    await recursiveCrawlFromBaseURL(
+      ({ document, title, href }) => {
+        processLangchainDocument({
+          doc: document,
+          title,
+          href,
+          collectionName: matchedSource.collection,
+          mongoClient,
+          dryRun,
+        });
+      },
+      {
+        baseUrl: url,
+        maxDepth,
+        verbose,
+      },
+    );
   } else {
     const newCollectionName = createCollectionNameFromURL(url);
     verbose &&
@@ -53,14 +63,23 @@ export async function crawl(options: CrawlerOptions) {
        ),
      );
 
-    await recursiveCrawlFromBaseURL({
-      baseUrl: url,
-      collectionName: newCollectionName,
-      mongoClient,
-      maxDepth,
-      verbose,
-      dryRun,
-    });
+    await recursiveCrawlFromBaseURL(
+      ({ document, title, href }) => {
+        processLangchainDocument({
+          doc: document,
+          title,
+          href,
+          collectionName: newCollectionName,
+          mongoClient,
+          dryRun,
+        });
+      },
+      {
+        baseUrl: url,
+        maxDepth,
+        verbose,
+      },
+    );
   }
 }
 // Otherwise crawl all sources
@@ -71,14 +90,23 @@ export async function crawl(options: CrawlerOptions) {
  );
 
  for (const source of SOURCES) {
-    await recursiveCrawlFromBaseURL({
-      baseUrl: source.url,
-      collectionName: source.collection,
-      mongoClient,
-      maxDepth,
-      verbose,
-      dryRun,
-    });
+    await recursiveCrawlFromBaseURL(
+      ({ document, title, href }) => {
+        processLangchainDocument({
+          doc: document,
+          title,
+          href,
+          collectionName: source.collection,
+          mongoClient,
+          dryRun,
+        });
+      },
+      {
+        baseUrl: source.url,
+        maxDepth,
+        verbose,
+      },
+    );
  }
 }
 
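The shape of the refactored API, inferred only from the call sites in this diff: `recursiveCrawlFromBaseURL` now takes a per-page callback first and a crawl-options object second, instead of one combined options object that included the MongoDB client. A hedged standalone sketch (the property types on the callback argument are assumptions; the real ones live in `recursiveCrawlFromBase.ts`):

```ts
import { recursiveCrawlFromBaseURL } from '@lg-tools/crawler';

// Collect crawled pages without writing to MongoDB: the callback receives the
// LangChain document plus the resolved page title and href for every page the
// crawler visits under the base URL.
const visited: Array<{ title: string; href: string }> = [];

await recursiveCrawlFromBaseURL(
  ({ document, title, href }) => {
    visited.push({ title, href });
    console.log(`${title} (${href}): ${document.pageContent.length} chars`);
  },
  {
    baseUrl: 'https://atomicdesign.bradfrost.com/table-of-contents',
    maxDepth: 1,
    verbose: true,
  },
);

console.log(`Visited ${visited.length} pages`);
```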

tools/crawler/src/index.ts

Lines changed: 10 additions & 1 deletion

@@ -1,6 +1,15 @@
 export { allowedDomains, MDB_DB, SOURCES } from './constants';
-export { processSingleUrl } from './utils/processSingleUrl';
 export {
+  type LoadedPageContents,
+  loadPageContents,
+} from './utils/loadPageContents';
+export {
+  processLangchainDocument,
+  type ProcessLangchainDocumentOptions as ProcessSingleUrlOptions,
+  type ProcessLangchainDocumentResult as ProcessSingleUrlResult,
+} from './utils/processLangchainDocument';
+export {
+  type CrawlerCallback,
   recursiveCrawlFromBaseURL,
   type RecursiveCrawlOptions,
 } from './utils/recursiveCrawlFromBase';

tools/crawler/src/lambda.ts

Lines changed: 15 additions & 6 deletions

@@ -1,4 +1,5 @@
 import { connectToMongoDB } from './utils/connectToMongoDB';
+import { processLangchainDocument } from './utils/processLangchainDocument';
 import { recursiveCrawlFromBaseURL } from './utils/recursiveCrawlFromBase';
 import { SOURCES } from './constants';
 
@@ -17,12 +18,20 @@ exports.handler = async (_e: any) => {
 
   try {
     for (const source of SOURCES) {
-      await recursiveCrawlFromBaseURL({
-        baseUrl: source.url,
-        collectionName: source.collection,
-        mongoClient,
-        maxDepth: 3,
-      });
+      await recursiveCrawlFromBaseURL(
+        ({ document, title, href }) =>
+          processLangchainDocument({
+            doc: document,
+            title,
+            href,
+            collectionName: source.collection,
+            mongoClient,
+          }),
+        {
+          baseUrl: source.url,
+          maxDepth: 3,
+        },
+      );
     }
   } catch (error) {
     console.error('Error during crawling:', error);
