scrape-embed.ts
import { Document } from 'langchain/document';
import * as fs from 'fs/promises';
import { CustomWebLoader } from '@/utils/custom_web_loader';
import type { SupabaseClient } from '@supabase/supabase-js';
import { Embeddings, OpenAIEmbeddings } from 'langchain/embeddings';
import { SupabaseVectorStore } from 'langchain/vectorstores';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { supabaseClient } from '@/utils/supabase-client';
import { urls } from '@/config/notionurls';
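
// Note: this script assumes OPENAI_API_KEY is set in the environment for
// OpenAIEmbeddings, and that supabaseClient (see utils/supabase-client) is
// configured with the Supabase project URL and key.

// Fetch and parse a single URL with the custom web loader; log and return an
// empty array on failure so one bad page does not abort the whole run.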
async function extractDataFromUrl(url: string): Promise<Document[]> {
  try {
    const loader = new CustomWebLoader(url);
    const docs = await loader.load();
    return docs;
  } catch (error) {
    console.error(`Error while extracting data from ${url}: ${error}`);
    return [];
  }
}
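
// Scrape each configured URL sequentially, collect the documents, and write
// the raw scrape to franknotion.json as a local snapshot before embedding.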
async function extractDataFromUrls(urls: string[]): Promise<Document[]> {
  console.log('extracting data from urls...');
  const documents: Document[] = [];
  for (const url of urls) {
    const docs = await extractDataFromUrl(url);
    documents.push(...docs);
  }
  console.log('data extracted from urls');
  const json = JSON.stringify(documents);
  await fs.writeFile('franknotion.json', json);
  console.log('json file containing data saved on disk');
  return documents;
}
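
// Create embeddings for the chunked documents and store the vectors in
// Supabase via LangChain's SupabaseVectorStore.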
async function embedDocuments(
  client: SupabaseClient,
  docs: Document[],
  embeddings: Embeddings,
) {
  console.log('creating embeddings...');
  await SupabaseVectorStore.fromDocuments(client, docs, embeddings);
  console.log('embeddings successfully stored in supabase');
}
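
// Split documents into ~2000-character chunks with a 200-character overlap so
// each chunk fits comfortably in the OpenAI embedding context window.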
async function splitDocsIntoChunks(docs: Document[]): Promise<Document[]> {
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 2000,
    chunkOverlap: 200,
  });
  return await textSplitter.splitDocuments(docs);
}
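
// Entry point: scrape -> chunk -> embed, driven by the URL list imported from
// config/notionurls.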
(async function run(urls: string[]) {
  try {
    // load data from each url
    const rawDocs = await extractDataFromUrls(urls);
    // split docs into chunks for openai context window
    const docs = await splitDocsIntoChunks(rawDocs);
    // embed docs into supabase
    await embedDocuments(supabaseClient, docs, new OpenAIEmbeddings());
  } catch (error) {
    console.log('error occurred:', error);
  }
})(urls);