Skip to content

Commit 1298530

Browse files
authored
Merge pull request #20 from mudler/feat/speedup-add-documents
feat: Speedup adding batch of documents
2 parents 1013e4e + 9875fc4 commit 1298530

File tree

6 files changed

+61
-10
lines changed

6 files changed

+61
-10
lines changed

rag/engine.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77

88
type Engine interface {
99
Store(s string, metadata map[string]string) (engine.Result, error)
10+
StoreDocuments(s []string, metadata map[string]string) ([]engine.Result, error)
1011
Reset() error
1112
Search(s string, similarEntries int) ([]types.Result, error)
1213
Count() int

rag/engine/chromem.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,35 @@ func (c *ChromemDB) Store(s string, metadata map[string]string) (Result, error)
106106
}, nil
107107
}
108108

109+
func (c *ChromemDB) StoreDocuments(s []string, metadata map[string]string) ([]Result, error) {
110+
defer func() {
111+
c.index += len(s)
112+
}()
113+
114+
if len(s) == 0 {
115+
return nil, fmt.Errorf("empty string")
116+
}
117+
118+
results := make([]Result, len(s))
119+
documents := make([]chromem.Document, len(s))
120+
for i, content := range s {
121+
documents[i] = chromem.Document{
122+
Metadata: metadata,
123+
Content: content,
124+
ID: fmt.Sprint(c.index + i),
125+
}
126+
results[i] = Result{
127+
ID: fmt.Sprint(c.index + i),
128+
}
129+
}
130+
131+
if err := c.collection.AddDocuments(context.Background(), documents, runtime.NumCPU()); err != nil {
132+
return nil, err
133+
}
134+
135+
return results, nil
136+
}
137+
109138
// Delete forwards the where filter, the whereDocuments filter and the given
// ids to the underlying collection's Delete method, using a background
// context. The collection's error is returned unchanged.
func (c *ChromemDB) Delete(where map[string]string, whereDocuments map[string]string, ids ...string) error {
	ctx := context.Background()
	err := c.collection.Delete(ctx, where, whereDocuments, ids...)
	return err
}

rag/engine/localai.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@ func (db *LocalAIRAGDB) Count() int {
3131
return 0
3232
}
3333

34+
func (db *LocalAIRAGDB) StoreDocuments(s []string, metadata map[string]string) ([]Result, error) {
35+
results := []Result{}
36+
for _, content := range s {
37+
result, err := db.Store(content, metadata)
38+
if err != nil {
39+
return nil, err
40+
}
41+
results = append(results, result)
42+
}
43+
return results, nil
44+
}
45+
3446
func (db *LocalAIRAGDB) Store(s string, metadata map[string]string) (Result, error) {
3547
resp, err := db.openaiClient.CreateEmbeddings(context.TODO(),
3648
openai.EmbeddingRequestStrings{

rag/persistency.go

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ func (db *PersistentKB) Store(entry string, metadata map[string]string) error {
197197
}
198198

199199
func (db *PersistentKB) storeFile(entry string, metadata map[string]string) error {
200+
xlog.Info("Storing file", "entry", entry)
200201
fileName := filepath.Base(entry)
201202

202203
// copy file to assetDir (if it's a file)
@@ -217,6 +218,7 @@ func (db *PersistentKB) storeFile(entry string, metadata map[string]string) erro
217218
}
218219

219220
func (db *PersistentKB) StoreOrReplace(entry string, metadata map[string]string) error {
221+
xlog.Info("Storing or replacing entry", "entry", entry)
220222
db.Lock()
221223
defer db.Unlock()
222224

@@ -234,22 +236,23 @@ func (db *PersistentKB) StoreOrReplace(entry string, metadata map[string]string)
234236
}
235237

236238
func (db *PersistentKB) store(metadata map[string]string, files ...string) ([]engine.Result, error) {
239+
xlog.Info("Storing files", "files", files)
237240
results := []engine.Result{}
241+
238242
for _, c := range files {
239243
e := filepath.Join(db.assetDir, filepath.Base(c))
240244
pieces, err := chunkFile(e, db.maxChunkSize)
241245
if err != nil {
242246
return nil, err
243247
}
244-
for _, p := range pieces {
245-
metadata["type"] = "file"
246-
metadata["source"] = c
247-
res, err := db.Engine.Store(p, metadata)
248-
if err != nil {
249-
return nil, err
250-
}
251-
results = append(results, res)
248+
metadata["type"] = "file"
249+
metadata["source"] = c
250+
xlog.Info("Storing pieces", "pieces", pieces, "metadata", metadata)
251+
res, err := db.Engine.StoreDocuments(pieces, metadata)
252+
if err != nil {
253+
return nil, err
252254
}
255+
results = append(results, res...)
253256
db.index[c] = results
254257
}
255258

rag/source_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ func (sm *SourceManager) updateSource(collectionName string, source ExternalSour
125125
return
126126
}
127127

128-
xlog.Info("Content", "content", content)
128+
//xlog.Info("Content", "content", content)
129129

130130
// Create a temporary file to store the content
131131
sanitizedURL := sanitizeURL(source.URL)

rag/sources/router.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
package sources
22

3-
import "strings"
3+
import (
4+
"strings"
5+
6+
"github.com/mudler/localrecall/pkg/xlog"
7+
)
48

59
func SourceRouter(url string) (string, error) {
610

11+
xlog.Info("Downloading content from", "url", url)
712
switch {
813
case strings.HasSuffix(url, "sitemap.xml"):
914
content, err := GetWebSitemapContent(url)
1015
if err != nil {
1116
return "", err
1217
}
18+
xlog.Info("Downloaded all content from sitemap", "url", url, "length", len(content))
1319
return strings.Join(content, "\n"), nil
1420
}
1521

0 commit comments

Comments
 (0)