Skip to content

Commit 026416f

Browse files
committed
feat(ingest): add MongoDB Design website data source and integrate into ingest configuration
1 parent dbf0676 commit 026416f

File tree

2 files changed

+183
-0
lines changed

2 files changed

+183
-0
lines changed

apps/chatbot-server/ingest/ingest.config.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { Config, makeIngestMetaStore } from 'mongodb-rag-ingest';
77

88
import { leafygreenGithubSourceConstructor } from './sources/github-leafygreen-ui';
99
import { mongoDbChatbotFrameworkDocsDataSourceConstructor } from './sources/github-mdb-chatbot-framework';
10+
import { mongoDbDesignDataSourceConstructor } from './sources/mongodb-design';
1011
import { createAzureEmbedderConstructor } from './utils/createAzureEmbedderConstructor';
1112
import { loadEnvVars } from './utils/loadEnv';
1213

@@ -51,6 +52,7 @@ export default {
5152
return Promise.all([
5253
mongoDbChatbotFrameworkDocsDataSourceConstructor(),
5354
leafygreenGithubSourceConstructor(),
55+
mongoDbDesignDataSourceConstructor(), // Add MongoDB Design website data source
5456
]);
5557
},
5658
} satisfies Config;
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/**
2+
* @fileoverview Data source for the MongoDB Design website (mongodb.design)
3+
*/
4+
import { CheerioWebBaseLoader } from '@langchain/community/document_loaders/web/cheerio';
5+
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
6+
import { DataSource, DataSourcePage, Page } from 'mongodb-rag-core';
7+
8+
/**
 * Creates a data source for the MongoDB Design website (mongodb.design).
 *
 * The returned object exposes `getPages()`, an async generator that:
 *  1. fetches each top-level section (`/component`, `/guidelines`,
 *     `/foundation`, `/resources`) with `CheerioWebBaseLoader`,
 *  2. strips `<script>`, `<style>` and `<noscript>` elements,
 *  3. splits the body text into chunks (1000 chars, 200 overlap), and
 *  4. yields one `{ page }` entry per chunk.
 *
 * While on `/component`, every distinct link of the form
 * `/component/<...>/live-example` is delegated to `processComponentPage`
 * before the section's own chunks are yielded.
 *
 * A failure while processing one section is logged with `console.error`
 * and the crawl continues with the next section.
 *
 * NOTE(review): every chunk of a page is yielded with the same `url`;
 * confirm the downstream page store does not key pages by URL alone.
 * NOTE(review): confirm that the installed `mongodb-rag-core` version's
 * `DataSource` type actually expects `getPages` / `DataSourcePage`.
 *
 * @returns A promise resolving to the configured data source.
 */
export const mongoDbDesignDataSourceConstructor =
  async (): Promise<DataSource> => {
    return {
      name: 'mongodb-design-website',
      metadata: {
        productName: 'MongoDB Design System',
        version: '1.0.0',
        tags: ['design', 'ui', 'components', 'leafygreen'],
      },

      async *getPages(): AsyncGenerator<DataSourcePage> {
        // Base URL for MongoDB Design website
        const baseUrl = 'https://www.mongodb.design';

        // Define the sections/paths to crawl
        const pagesToCrawl = [
          '/component',
          '/guidelines',
          '/foundation',
          '/resources',
        ];

        // The splitter configuration never changes, so build it once
        // instead of once per crawled page.
        const textSplitter = new RecursiveCharacterTextSplitter({
          chunkSize: 1000,
          chunkOverlap: 200,
        });

        // Process each top-level page
        for (const pagePath of pagesToCrawl) {
          const pageUrl = `${baseUrl}${pagePath}`;
          try {
            // Use LangChain's CheerioWebBaseLoader to fetch and parse the page content
            const loader = new CheerioWebBaseLoader(pageUrl, {
              selector: 'body',
            });
            const cheerio = await loader.scrape();

            // Remove script and style tags to clean up content
            cheerio('script').remove();
            cheerio('style').remove();
            cheerio('noscript').remove();

            // Trim each candidate so that a whitespace-only <title> or <h1>
            // falls through to the next fallback instead of being used as-is.
            const title =
              cheerio('title').text().trim() ||
              cheerio('h1').first().text().trim() ||
              pagePath.split('/').pop() ||
              'MongoDB Design';

            // Get the main content and split it into chunks
            const content = cheerio('body').text();
            const chunks = await textSplitter.splitText(content);

            // Find component links if we're on the components page
            if (pagePath === '/component') {
              const liveExampleHrefs = cheerio('a')
                .map((_i, el) => {
                  const href = cheerio(el).attr('href');
                  if (
                    href &&
                    href.startsWith('/component/') &&
                    href.includes('/live-example')
                  ) {
                    return href;
                  }
                  return null;
                })
                .get()
                .filter(
                  (href): href is string =>
                    typeof href === 'string' && href.length > 0,
                );

                // The same component can be linked several times on the page;
                // de-duplicate so each component page is scraped and yielded
                // only once. A Set preserves first-seen order.
              const componentLinks = new Set<string>(liveExampleHrefs);

              // Process each component page
              for (const componentLink of componentLinks) {
                yield* processComponentPage(`${baseUrl}${componentLink}`);
              }
            }

            // Create and yield one page entry per chunk
            for (let i = 0; i < chunks.length; i++) {
              const pageData: Page = {
                url: pageUrl,
                title: `${title} - Part ${i + 1}`,
                content: chunks[i],
                metadata: {
                  source: 'mongodb-design',
                  section: pagePath.substring(1),
                  chunkIndex: i,
                },
              };

              yield {
                page: pageData,
              };
            }
          } catch (error) {
            // Best-effort crawl: log and move on to the next section.
            console.error(`Error processing ${pageUrl}:`, error);
          }
        }
      },
    };
  };
116+
117+
/**
 * Helper function to process individual component pages.
 *
 * Fetches `url` with `CheerioWebBaseLoader`, strips `<script>`, `<style>`
 * and `<noscript>` elements, splits the body text into chunks (1000 chars,
 * 200 overlap) and yields one `{ page }` entry per chunk, tagged with
 * `section: 'component'` and the component name derived from the URL.
 *
 * Any error raised while loading or processing the page is logged with
 * `console.error`; in that case the generator simply ends.
 *
 * @param url - Absolute URL of the component page. The path segment that
 *   follows `/component/` is turned into a display name
 *   (e.g. `text-input` becomes `Text Input`). If the URL has no such
 *   segment the name is the empty string and the title falls back to
 *   `'Component'`.
 * @returns An async generator of page entries, one per text chunk.
 */
async function* processComponentPage(
  url: string,
): AsyncGenerator<DataSourcePage> {
  try {
    // Load the component page
    const loader = new CheerioWebBaseLoader(url, {
      selector: 'body',
    });
    const cheerio = await loader.scrape();

    // Clean up the content
    cheerio('script').remove();
    cheerio('style').remove();
    cheerio('noscript').remove();

    // Extract the component slug from the URL. Guard the indexed accesses so
    // a URL without a "/component/" segment yields an empty name instead of
    // throwing, and cut the slug at any query string or fragment.
    const slug = url.split('/component/')[1]?.split(/[/?#]/)[0] ?? '';
    const componentName = slug
      .split('-')
      .map(word => word.charAt(0).toUpperCase() + word.slice(1))
      .join(' ');

    // Trim each candidate so that a whitespace-only <title> or <h1> falls
    // through to the next fallback instead of being used as the title.
    const title =
      cheerio('title').text().trim() ||
      cheerio('h1').first().text().trim() ||
      componentName ||
      'Component';

    const content = cheerio('body').text();

    // Split text into chunks
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });
    const chunks = await textSplitter.splitText(content);

    // Create and yield one page entry per chunk
    for (let i = 0; i < chunks.length; i++) {
      const pageData: Page = {
        url,
        title: `${title} - Part ${i + 1}`,
        content: chunks[i],
        metadata: {
          source: 'mongodb-design',
          section: 'component',
          componentName,
          chunkIndex: i,
        },
      };

      yield {
        page: pageData,
      };
    }
  } catch (error) {
    // Best-effort: log the failure and end the generator.
    console.error(`Error processing component page ${url}:`, error);
  }
}

0 commit comments

Comments
 (0)