feat: support URLs with sitemaps #18

Merged: 1 commit, May 20, 2025

4 changes: 2 additions & 2 deletions .gitignore
@@ -3,5 +3,5 @@ localrecall
 collections/
 state/
 assets/
-
-.env
+localrag
+.env
46 changes: 46 additions & 0 deletions pkg/chunk/chunking.go
@@ -0,0 +1,46 @@
package chunk

import (
	"strings"
)

// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
// and returns a slice of strings where each string is a chunk of the paragraph
// that is at most maxChunkSize long (a single word longer than maxChunkSize is
// emitted whole), ensuring that words are never split.
func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
	if len(paragraph) <= maxChunkSize {
		return []string{paragraph}
	}

	var chunks []string
	var currentChunk strings.Builder

	words := strings.Fields(paragraph) // Splits the paragraph into words.

	for _, word := range words {
		// If adding the next word (plus one byte for the separating space)
		// would exceed maxChunkSize, flush currentChunk and start a new one.
		if currentChunk.Len() > 0 && currentChunk.Len()+len(word)+1 > maxChunkSize {
			chunks = append(chunks, currentChunk.String())
			currentChunk.Reset()
		} else if currentChunk.Len() == 0 && len(word) > maxChunkSize {
			// The word alone exceeds maxChunkSize: emit it as its own chunk
			// rather than splitting it mid-word.
			chunks = append(chunks, word)
			continue
		}

		// Add a space before the word if it's not the beginning of a new chunk.
		if currentChunk.Len() > 0 {
			currentChunk.WriteString(" ")
		}

		// Add the word to the current chunk.
		currentChunk.WriteString(word)
	}

	// After the loop, add any remaining content in currentChunk to chunks.
	if currentChunk.Len() > 0 {
		chunks = append(chunks, currentChunk.String())
	}

	return chunks
}
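
For reference, a minimal sketch (not part of this PR) of calling the new helper; the sample text and the 32-byte limit are made up for illustration:

package main

import (
	"fmt"

	"github.com/mudler/localrecall/pkg/chunk"
)

func main() {
	text := "LocalRecall splits long paragraphs into word-aligned chunks before indexing them."
	// Request chunks of at most 32 bytes; words are never split.
	for i, c := range chunk.SplitParagraphIntoChunks(text, 32) {
		fmt.Printf("chunk %d (%d bytes): %q\n", i, len(c), c)
	}
}
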
150 changes: 0 additions & 150 deletions rag/chunking.go

This file was deleted.

49 changes: 49 additions & 0 deletions rag/persistency.go
@@ -1,13 +1,17 @@
 package rag
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
+	"io"
 	"path/filepath"
 
 	"os"
 	"sync"
 
+	"github.com/dslipak/pdf"
+	"github.com/mudler/localrecall/pkg/chunk"
 	"github.com/mudler/localrecall/pkg/xlog"
 	"github.com/mudler/localrecall/rag/engine"
 )
@@ -289,6 +293,51 @@ func copyFile(src, dst string) error {
	return os.WriteFile(filepath.Join(dst, filepath.Base(src)), in, 0644)
}

func chunkFile(fpath string, maxchunksize int) ([]string, error) {
	if _, err := os.Stat(fpath); os.IsNotExist(err) {
		return nil, fmt.Errorf("file does not exist: %s", fpath)
	}

	// Get the file extension:
	// If it's a .txt file, read the file and split it into chunks.
	// If it's a .pdf file, convert it to text and split it into chunks.
	// ...
	extension := filepath.Ext(fpath)
	switch extension {
	case ".pdf":
		r, err := pdf.Open(fpath)
		if err != nil {
			return nil, err
		}
		var buf bytes.Buffer
		b, err := r.GetPlainText()
		if err != nil {
			return nil, err
		}
		buf.ReadFrom(b)
		return chunk.SplitParagraphIntoChunks(buf.String(), maxchunksize), nil
	case ".txt", ".md":
		xlog.Debug("Reading text file: ", fpath)
		f, err := os.Open(fpath)
		if err != nil {
			xlog.Error("Error opening file: ", fpath)
			return nil, err
		}
		defer f.Close()
		content, err := io.ReadAll(f)
		if err != nil {
			xlog.Error("Error reading file: ", fpath)
			return nil, err
		}
		return chunk.SplitParagraphIntoChunks(string(content), maxchunksize), nil

	default:
		xlog.Error("Unsupported file type: ", extension)
	}

	return nil, fmt.Errorf("not implemented")
}
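
Inside the rag package, a call site for the new helper might look roughly like the sketch below; storeFileChunks, the 512-byte chunk size, and the loop body are hypothetical and not part of this PR:

// Hypothetical call site, for illustration only.
func storeFileChunks(fpath string) error {
	chunks, err := chunkFile(fpath, 512) // 512 is an arbitrary example size
	if err != nil {
		return err
	}
	for _, c := range chunks {
		// Each chunk would then be embedded and stored in the engine.
		xlog.Debug("Got chunk of length: ", len(c))
	}
	return nil
}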

// GetExternalSources returns the list of external sources for this collection
func (db *PersistentKB) GetExternalSources() []ExternalSource {
	db.Lock()
3 changes: 2 additions & 1 deletion rag/source_manager.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/mudler/localrecall/pkg/xlog"
+	"github.com/mudler/localrecall/rag/sources"
 )

// ExternalSource represents a source that needs to be periodically updated
@@ -112,7 +113,7 @@ func (sm *SourceManager) RemoveSource(collectionName, url string) error {
func (sm *SourceManager) updateSource(collectionName string, source ExternalSource, collection *PersistentKB) {

	xlog.Info("Updating source", "url", source.URL)
-	content, err := GetWebPage(source.URL)
+	content, err := sources.SourceRouter(source.URL)
	if err != nil {
		xlog.Error("Error updating source", err)
		return
17 changes: 17 additions & 0 deletions rag/sources/router.go
@@ -0,0 +1,17 @@
package sources

import "strings"

func SourceRouter(url string) (string, error) {
	switch {
	case strings.HasSuffix(url, "sitemap.xml"):
		content, err := GetWebSitemapContent(url)
		if err != nil {
			return "", err
		}
		return strings.Join(content, "\n"), nil
	}

	return GetWebPage(url)
}
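
For illustration, a minimal sketch of driving the new router directly; the example.com URL is a placeholder:

package main

import (
	"fmt"

	"github.com/mudler/localrecall/rag/sources"
)

func main() {
	// URLs ending in "sitemap.xml" are crawled page by page; any other
	// URL is fetched as a single page and converted to plain text.
	content, err := sources.SourceRouter("https://example.com/sitemap.xml")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Printf("extracted %d characters of text\n", len(content))
}
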
36 changes: 36 additions & 0 deletions rag/sources/web.go
@@ -0,0 +1,36 @@
package sources

import (
	"io"
	"net/http"

	"github.com/mudler/localrecall/pkg/xlog"
	sitemap "github.com/oxffaa/gopher-parse-sitemap"
	"jaytaylor.com/html2text"
)

// GetWebPage fetches a URL and converts the HTML response body to plain text.
func GetWebPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return html2text.FromString(string(body), html2text.Options{PrettyTables: true})
}

// GetWebSitemapContent fetches every page listed in a sitemap and returns
// their plain-text contents; pages that fail to download are skipped rather
// than aborting the crawl.
func GetWebSitemapContent(url string) (res []string, err error) {
	err = sitemap.ParseFromSite(url, func(e sitemap.Entry) error {
		xlog.Info("Sitemap page: " + e.GetLocation())
		content, err := GetWebPage(e.GetLocation())
		if err == nil {
			res = append(res, content)
		}
		return nil
	})
	return
}
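
A hypothetical test, not included in this PR, that exercises GetWebSitemapContent against a local httptest server; it assumes gopher-parse-sitemap accepts a minimal urlset document:

package sources

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"testing"
)

func TestGetWebSitemapContent(t *testing.T) {
	// Serve a one-entry sitemap whose <loc> points back at this server.
	var srv *httptest.Server
	srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/sitemap.xml" {
			fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>%s/page</loc></url>
</urlset>`, srv.URL)
			return
		}
		fmt.Fprint(w, "<html><body><p>hello</p></body></html>")
	}))
	defer srv.Close()

	// Per-page fetch failures are skipped rather than returned, so only
	// pages that downloaded successfully appear in the result.
	pages, err := GetWebSitemapContent(srv.URL + "/sitemap.xml")
	if err != nil {
		t.Fatal(err)
	}
	if len(pages) != 1 {
		t.Fatalf("expected 1 page, got %d", len(pages))
	}
}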