Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<p align="center">
<img src="./doc/golem.svg" height="240" />
<h3 align="center">Embeddings</h3>
<p align="center"><strong>adapter over various popular vector embeddings interfaces: AWS BedRock, OpenAI, word2vec</strong></p>

Expand Down
59 changes: 41 additions & 18 deletions scanner/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,15 @@ import (
//
// Scanning stops unrecoverably at EOF or the first I/O error.
type Scanner struct {
embed embeddings.Embeddings
similarity func([]float32, []float32) bool
windowInSentences int
scanner Reader
err error
eof bool
window []vector
cursor []string
embed embeddings.Embeddings
confSimilarity func([]float32, []float32) bool
confWindowInSentences int
confSimilarityWith SimilarityWith
scanner Reader
err error
eof bool
window []vector
cursor []string
}

// Reader is an interface similar to [bufio.Scanner].
Expand All @@ -62,24 +63,37 @@ type vector struct {
// Creates new instance of Scanner to read from io.Reader and using embedding.
func New(embed embeddings.Embeddings, r Reader) *Scanner {
return &Scanner{
embed: embed,
similarity: HighSimilarity,
windowInSentences: 32,
scanner: r,
window: make([]vector, 0),
embed: embed,
confSimilarity: HighSimilarity,
confWindowInSentences: 32,
confSimilarityWith: SIMILARITY_WITH_TAIL,
scanner: r,
window: make([]vector, 0),
}
}

// Similarity sets the similarity function for the Scanner.
// The default is HighSimilarity.
func (s *Scanner) Similarity(f func([]float32, []float32) bool) {
s.similarity = f
s.confSimilarity = f
}

// SimilarityWith selects which element of the chunk being formed is used as
// the reference when deciding whether the next item of the window joins the
// chunk. The value set by New is SIMILARITY_WITH_TAIL.
//
// SIMILARITY_WITH_HEAD compares every candidate with the first element of
// the chunk. The first element stays the same while the chunk is formed.
//
// SIMILARITY_WITH_TAIL compares every candidate with the last element of
// the chunk. The last element changes each time a new one is appended.
func (s *Scanner) SimilarityWith(x SimilarityWith) {
s.confSimilarityWith = x
}

// Window defines the context window for similarity detection.
// The default value is 32 sentences.
func (s *Scanner) Window(n int) {
s.windowInSentences = n
s.confWindowInSentences = n
}

func (s *Scanner) Err() error { return s.err }
Expand All @@ -106,7 +120,7 @@ func (s *Scanner) Scan() bool {

// fill the window
func (s *Scanner) fill() (bool, error) {
wn := s.windowInSentences - len(s.window)
wn := s.confWindowInSentences - len(s.window)
for wn > 0 && s.scanner.Scan() {
txt := s.scanner.Text()
v32, err := s.embed.Embedding(context.Background(), txt)
Expand All @@ -131,12 +145,21 @@ func (s *Scanner) peek() []string {
return nil
}

// split the window into similar (a) and non-similar (b) items
a, b := make([]vector, 0), make([]vector, 0)
a = append(a, s.window[0])

for i := 1; i < len(s.window); i++ {
tail := a[len(a)-1]
if s.similarity(tail.vector, s.window[i].vector) {
var at int
switch s.confSimilarityWith {
case SIMILARITY_WITH_HEAD:
at = 0
case SIMILARITY_WITH_TAIL:
at = len(a) - 1
}
ref := a[at]

if s.confSimilarity(ref.vector, s.window[i].vector) {
a = append(a, s.window[i])
} else {
b = append(b, s.window[i])
Expand Down
16 changes: 16 additions & 0 deletions scanner/similarity.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ func Dissimilar(a, b []float32) bool {
return 0.8 < x && x <= 1.0
}

// RangeSimilarity builds a similarity function for a caller-chosen interval.
// The returned function reports true when the value computed by cosine for
// the two vectors lies in the closed interval [lo, hi] (both bounds are
// inclusive). Use it when none of the predefined intervals fits.
func RangeSimilarity(lo, hi float32) func(a, b []float32) bool {
	// within is the closed-interval membership test shared by every call.
	within := func(d float32) bool {
		return lo <= d && d <= hi
	}

	return func(a, b []float32) bool {
		return within(cosine(a, b))
	}
}

// CosineSimilarity adapts a predicate over a single cosine value into a
// similarity function over a pair of vectors. The returned function computes
// cosine(a, b) and reports whatever f answers for that value.
func CosineSimilarity(f func(float32) bool) func(a, b []float32) bool {
	return func(a, b []float32) bool {
		d := cosine(a, b)
		return f(d)
	}
}

func cosine(a, b []float32) (d float32) {
if len(a) != len(b) {
panic("vectors must have equal lengths")
Expand Down
76 changes: 54 additions & 22 deletions scanner/sorter.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,44 +31,67 @@ import (
// The module provides high, medium, weak and dissimilarity functions based on
// cosine distance.
type Sorter[T any] struct {
embed embeddings.Embeddings
similarity func([]float32, []float32) bool
windowInSentences int
scanner seq.Seq[T]
lens optics.Lens[T, string]
err error
eof bool
window []typed[T]
cursor []T
embed embeddings.Embeddings
confSimilarity func([]float32, []float32) bool
confWindowInSentences int
confSimilarityWith SimilarityWith
scanner seq.Seq[T]
lens optics.Lens[T, string]
err error
eof bool
window []typed[T]
cursor []T
}

// SimilarityWith selects which element of the chunk being formed serves as
// the reference that each candidate is compared against.
type SimilarityWith int

// Reference modes accepted by the SimilarityWith setters.
// SIMILARITY_WITH_HEAD is the zero value of the type; the constructors in
// this package set SIMILARITY_WITH_TAIL explicitly.
const (
// SIMILARITY_WITH_HEAD compares candidates with the first element of the chunk.
SIMILARITY_WITH_HEAD SimilarityWith = iota
// SIMILARITY_WITH_TAIL compares candidates with the last element of the chunk.
SIMILARITY_WITH_TAIL
)

// typed pairs a source record with the vector used for similarity checks.
type typed[T any] struct {
// object is the original record as read from the sequence.
object T
// vector is passed to the similarity function when chunks are formed.
// NOTE(review): presumably the embedding of the text the lens extracts
// from object — confirm against fill.
vector []float32
}

// Creates new instance of Sorter to read from seq.Seq[T] and using embedding.
// Creates new instance of semantic Sorter, seq.Seq[T] is source of records.
func NewSorter[T any](embed embeddings.Embeddings, lens optics.Lens[T, string], seq seq.Seq[T]) *Sorter[T] {
return &Sorter[T]{
embed: embed,
similarity: HighSimilarity,
windowInSentences: 32,
scanner: seq,
lens: lens,
window: make([]typed[T], 0),
embed: embed,
confSimilarity: HighSimilarity,
confWindowInSentences: 32,
confSimilarityWith: SIMILARITY_WITH_TAIL,
scanner: seq,
lens: lens,
window: make([]typed[T], 0),
}
}

// Similarity sets the similarity function for the Scanner.
// Similarity sets the similarity function for the Sorter.
// The default is HighSimilarity.
func (s *Sorter[T]) Similarity(f func([]float32, []float32) bool) {
s.similarity = f
s.confSimilarity = f
}

// SimilarityWith selects which element of the chunk being formed is used as
// the reference when deciding whether the next record of the window joins
// the chunk. The value set by NewSorter is SIMILARITY_WITH_TAIL.
//
// SIMILARITY_WITH_HEAD compares every candidate with the first element of
// the chunk. The first element stays the same while the chunk is formed.
//
// SIMILARITY_WITH_TAIL compares every candidate with the last element of
// the chunk. The last element changes each time a new one is appended.
func (s *Sorter[T]) SimilarityWith(x SimilarityWith) {
s.confSimilarityWith = x
}

// Window defines the context window for similarity detection.
// The default value is 32 sentences.
func (s *Sorter[T]) Window(n int) {
s.windowInSentences = n
s.confWindowInSentences = n
}

func (s *Sorter[T]) Err() error { return s.err }
Expand All @@ -95,7 +118,7 @@ func (s *Sorter[T]) Next() bool {

// fill the window
func (s *Sorter[T]) fill() (bool, error) {
wn := s.windowInSentences - len(s.window)
wn := s.confWindowInSentences - len(s.window)

has := s.scanner != nil
for ; wn > 0 && has; has = s.scanner.Next() {
Expand All @@ -119,12 +142,21 @@ func (s *Sorter[T]) peek() []T {
return nil
}

// split the window into similar (a) and non-similar (b) items
a, b := make([]typed[T], 0), make([]typed[T], 0)
a = append(a, s.window[0])

for i := 1; i < len(s.window); i++ {
tail := a[len(a)-1]
if s.similarity(tail.vector, s.window[i].vector) {
var at int
switch s.confSimilarityWith {
case SIMILARITY_WITH_HEAD:
at = 0
case SIMILARITY_WITH_TAIL:
at = len(a) - 1
}
ref := a[at]

if s.confSimilarity(ref.vector, s.window[i].vector) {
a = append(a, s.window[i])
} else {
b = append(b, s.window[i])
Expand Down
2 changes: 1 addition & 1 deletion scanner/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@

package scanner

const Version = "scanner/v0.0.4"
const Version = "scanner/v0.0.5"
Loading