kshard · fogfish · Feb 2, 2025 · Feb 2, 2025 · Feb 2, 2025
diff --git a/README.md b/README.md
@@ -1,5 +1,4 @@
 <p align="center">
-  <img src="./doc/golem.svg" height="240" />
   <h3 align="center">Embeddings</h3>
   <p align="center"><strong>adapter over various popular vector embeddings interfaces: AWS BedRock, OpenAI, word2vec</strong></p>
 

diff --git a/scanner/scanner.go b/scanner/scanner.go
@@ -36,14 +36,15 @@ import (
 //
 // Scanning stops unrecoverably at EOF or the first I/O error.
 type Scanner struct {
-	embed             embeddings.Embeddings
-	similarity        func([]float32, []float32) bool
-	windowInSentences int
-	scanner           Reader
-	err               error
-	eof               bool
-	window            []vector
-	cursor            []string
+	embed                 embeddings.Embeddings
+	confSimilarity        func([]float32, []float32) bool
+	confWindowInSentences int
+	confSimilarityWith    SimilarityWith
+	scanner               Reader
+	err                   error
+	eof                   bool
+	window                []vector
+	cursor                []string
 }
 
 // Reader is an interface similar to [bufio.Scanner].
@@ -62,24 +63,37 @@ type vector struct {
 // Creates new instance of Scanner to read from io.Reader and using embedding.
 func New(embed embeddings.Embeddings, r Reader) *Scanner {
 	return &Scanner{
-		embed:             embed,
-		similarity:        HighSimilarity,
-		windowInSentences: 32,
-		scanner:           r,
-		window:            make([]vector, 0),
+		embed:                 embed,
+		confSimilarity:        HighSimilarity,
+		confWindowInSentences: 32,
+		confSimilarityWith:    SIMILARITY_WITH_TAIL,
+		scanner:               r,
+		window:                make([]vector, 0),
 	}
 }
 
 // Similarity sets the similarity function for the Scanner.
 // The default is HighSimilarity.
 func (s *Scanner) Similarity(f func([]float32, []float32) bool) {
-	s.similarity = f
+	s.confSimilarity = f
+}
+
+// SimilarityWith selects which element of the chunk new items are compared to.
+// The default is SIMILARITY_WITH_TAIL.
+//
+// SIMILARITY_WITH_HEAD compares each candidate with the first element of
+// the chunk. The first element stays the same while the chunk is formed.
+//
+// SIMILARITY_WITH_TAIL compares each candidate with the last element of
+// the chunk. The last element changes each time a new one is added to the chunk.
+func (s *Scanner) SimilarityWith(x SimilarityWith) {
+	s.confSimilarityWith = x
 }
 
 // Widow defines the context window for similarity detection.
 // The default value is 32 sentences.
 func (s *Scanner) Window(n int) {
-	s.windowInSentences = n
+	s.confWindowInSentences = n
 }
 
 func (s *Scanner) Err() error     { return s.err }
@@ -106,7 +120,7 @@ func (s *Scanner) Scan() bool {
 
 // fill the window
 func (s *Scanner) fill() (bool, error) {
-	wn := s.windowInSentences - len(s.window)
+	wn := s.confWindowInSentences - len(s.window)
 	for wn > 0 && s.scanner.Scan() {
 		txt := s.scanner.Text()
 		v32, err := s.embed.Embedding(context.Background(), txt)
@@ -131,12 +145,21 @@ func (s *Scanner) peek() []string {
 		return nil
 	}
 
+	// split the window into similar (a) and non-similar (b) items
 	a, b := make([]vector, 0), make([]vector, 0)
 	a = append(a, s.window[0])
 
 	for i := 1; i < len(s.window); i++ {
-		tail := a[len(a)-1]
-		if s.similarity(tail.vector, s.window[i].vector) {
+		var at int
+		switch s.confSimilarityWith {
+		case SIMILARITY_WITH_HEAD:
+			at = 0
+		case SIMILARITY_WITH_TAIL:
+			at = len(a) - 1
+		}
+		ref := a[at]
+
+		if s.confSimilarity(ref.vector, s.window[i].vector) {
 			a = append(a, s.window[i])
 		} else {
 			b = append(b, s.window[i])

diff --git a/scanner/similarity.go b/scanner/similarity.go
@@ -42,6 +42,22 @@ func Dissimilar(a, b []float32) bool {
 	return 0.8 < x && x <= 1.0
 }
 
+// RangeSimilarity returns a similarity function that reports true when the
+// cosine distance of two vectors lies within the closed interval [lo, hi].
+func RangeSimilarity(lo, hi float32) func(a, b []float32) bool {
+	return func(a, b []float32) bool {
+		x := cosine(a, b)
+		return lo <= x && x <= hi
+	}
+}
+
+// CosineSimilarity returns a similarity function that applies the predicate f to the cosine distance of two vectors.
+func CosineSimilarity(f func(float32) bool) func(a, b []float32) bool {
+	return func(a, b []float32) bool {
+		return f(cosine(a, b))
+	}
+}
+
 func cosine(a, b []float32) (d float32) {
 	if len(a) != len(b) {
 		panic("vectors must have equal lengths")

diff --git a/scanner/sorter.go b/scanner/sorter.go
@@ -31,44 +31,67 @@ import (
 // The module provides high, medium, weak and dissimilarity functions based on
 // cosine distance.
 type Sorter[T any] struct {
-	embed             embeddings.Embeddings
-	similarity        func([]float32, []float32) bool
-	windowInSentences int
-	scanner           seq.Seq[T]
-	lens              optics.Lens[T, string]
-	err               error
-	eof               bool
-	window            []typed[T]
-	cursor            []T
+	embed                 embeddings.Embeddings
+	confSimilarity        func([]float32, []float32) bool
+	confWindowInSentences int
+	confSimilarityWith    SimilarityWith
+	scanner               seq.Seq[T]
+	lens                  optics.Lens[T, string]
+	err                   error
+	eof                   bool
+	window                []typed[T]
+	cursor                []T
 }
 
+// SimilarityWith selects the chunk element that candidates are compared against while sorting.
+type SimilarityWith int
+
+// Supported reference elements: the head (first) or the tail (last) element of the chunk.
+const (
+	SIMILARITY_WITH_HEAD SimilarityWith = iota
+	SIMILARITY_WITH_TAIL
+)
+
 type typed[T any] struct {
 	object T
 	vector []float32
 }
 
-// Creates new instance of Sorter to read from seq.Seq[T] and using embedding.
+// NewSorter creates a new instance of the semantic Sorter; seq is the source of records.
 func NewSorter[T any](embed embeddings.Embeddings, lens optics.Lens[T, string], seq seq.Seq[T]) *Sorter[T] {
 	return &Sorter[T]{
-		embed:             embed,
-		similarity:        HighSimilarity,
-		windowInSentences: 32,
-		scanner:           seq,
-		lens:              lens,
-		window:            make([]typed[T], 0),
+		embed:                 embed,
+		confSimilarity:        HighSimilarity,
+		confWindowInSentences: 32,
+		confSimilarityWith:    SIMILARITY_WITH_TAIL,
+		scanner:               seq,
+		lens:                  lens,
+		window:                make([]typed[T], 0),
 	}
 }
 
-// Similarity sets the similarity function for the Scanner.
+// Similarity sets the similarity function for the Sorter.
 // The default is HighSimilarity.
 func (s *Sorter[T]) Similarity(f func([]float32, []float32) bool) {
-	s.similarity = f
+	s.confSimilarity = f
+}
+
+// SimilarityWith selects which element of the chunk new items are compared to.
+// The default is SIMILARITY_WITH_TAIL.
+//
+// SIMILARITY_WITH_HEAD compares each candidate with the first element of
+// the chunk. The first element stays the same while the chunk is formed.
+//
+// SIMILARITY_WITH_TAIL compares each candidate with the last element of
+// the chunk. The last element changes each time a new one is added to the chunk.
+func (s *Sorter[T]) SimilarityWith(x SimilarityWith) {
+	s.confSimilarityWith = x
 }
 
 // Widow defines the context window for similarity detection.
 // The default value is 32 sentences.
 func (s *Sorter[T]) Window(n int) {
-	s.windowInSentences = n
+	s.confWindowInSentences = n
 }
 
 func (s *Sorter[T]) Err() error { return s.err }
@@ -95,7 +118,7 @@ func (s *Sorter[T]) Next() bool {
 
 // fill the window
 func (s *Sorter[T]) fill() (bool, error) {
-	wn := s.windowInSentences - len(s.window)
+	wn := s.confWindowInSentences - len(s.window)
 
 	has := s.scanner != nil
 	for ; wn > 0 && has; has = s.scanner.Next() {
@@ -119,12 +142,21 @@ func (s *Sorter[T]) peek() []T {
 		return nil
 	}
 
+	// split the window into similar (a) and non-similar (b) items
 	a, b := make([]typed[T], 0), make([]typed[T], 0)
 	a = append(a, s.window[0])
 
 	for i := 1; i < len(s.window); i++ {
-		tail := a[len(a)-1]
-		if s.similarity(tail.vector, s.window[i].vector) {
+		var at int
+		switch s.confSimilarityWith {
+		case SIMILARITY_WITH_HEAD:
+			at = 0
+		case SIMILARITY_WITH_TAIL:
+			at = len(a) - 1
+		}
+		ref := a[at]
+
+		if s.confSimilarity(ref.vector, s.window[i].vector) {
 			a = append(a, s.window[i])
 		} else {
 			b = append(b, s.window[i])

diff --git a/scanner/version.go b/scanner/version.go
@@ -8,4 +8,4 @@
 
 package scanner
 
-const Version = "scanner/v0.0.4"
+const Version = "scanner/v0.0.5"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,4 +8,4 @@

		package scanner

		const Version = "scanner/v0.0.4"
		const Version = "scanner/v0.0.5"