import numpy as np
- from typing import List, Tuple
+ from typing import List, Tuple, Optional, Union
from itertools import chain
from .find_local_minima import find_local_minima, windowed_cross_similarity
- from .splitter import constrained_coalesce, split_sentences
+ from .splitter import (
+     constrained_batches,
+     constrained_coalesce,
+     split_sentences,
+     reverse_merge,
+ )


class SemanticSplitter:
-     """A class for semantically splitting and reconstructing text."""
+     """
+     A class for semantically splitting text.
+
+     This class provides methods to split text into chunks based on semantic similarity
+     and reconstruct them while maintaining semantic coherence.
+     """

    @staticmethod
-     def flatten(nested_list: List[List]) -> List:
-         """Flatten a list of lists into a single list."""
+     def flatten(nested_list: List[List[any]]) -> List[any]:
+         """
+         Flatten a list of lists into a single list.
+
+         Args:
+             nested_list (List[List[any]]): A list of lists to be flattened.
+
+         Returns:
+             List[any]: A flattened list containing all elements from the nested lists.
+         """
        return list(chain.from_iterable(nested_list))

    @staticmethod
    def constrained_split(
        text: str,
        target_size: int,
-         coalesce_range: Tuple[int, int, int] = (256, 576, 64),
        separator: str = " ",
+         min_size: int = 24,
    ) -> List[str]:
        """
        Split text into chunks of approximately target_size.

-         Parameters:
-         - text (str): The text to split.
-         - target_size (int): The target size for each chunk.
+         Args:
+             text (str): The text to split.
+             target_size (int): The target size for each chunk.
+             separator (str, optional): The separator to use when joining text. Defaults to " ".
+             min_size (int, optional): The minimum size for each chunk. Defaults to 24.

        Returns:
-         - List[str]: List of text chunks.
+             List[str]: List of text chunks.
        """
        sentences = split_sentences(text)
-         for i in range(*coalesce_range):
-             sentences = constrained_coalesce(sentences, i, separator=separator)
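+         # coalesce adjacent sentences up to target_size, then merge fragments shorter than min_size with a neighboring sentence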
+         sentences = constrained_coalesce(sentences, target_size, separator=separator)
+         sentences = reverse_merge(sentences, n=min_size, separator=separator)
        return sentences

    @classmethod
    def split(
        cls,
        text: str,
        target_size: int,
-         paragraph_range: Tuple[int, int, int] = (16, 60, 8),
-         sentence_range: Tuple[int, int, int] = (256, 576, 64),
+         cleanup_size: int = 24,
+         intermediate_size: int = 96,
    ) -> List[str]:
        """
-         Split the input text into chunks.
+         Split the input text into chunks based on semantic coherence.

-         Parameters:
-         - text (str): The input text to split.
-         - target_size (int): The target size for final chunks.
-         - initial_split_size (int): The initial size for splitting on newlines.
+         Args:
+             text (str): The input text to split.
+             target_size (int): The target size for final chunks.
+             cleanup_size (int, optional): The minimum size for cleaning up small chunks. Defaults to 24.
+             intermediate_size (int, optional): The initial size for splitting on newlines. Defaults to 96.

        Returns:
-         - List[str]: List of text chunks.
+             List[str]: List of text chunks.
        """
-         # paragraph splitting
-         # split on newlines and coalesce to cleanup
        lines = text.splitlines()
-         for i in range(*paragraph_range):
-             lines = constrained_coalesce(lines, i, separator="\n")
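+         # coalesce newline-split lines up to intermediate_size, then merge lines shorter than cleanup_size with a neighboring line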
+         lines = constrained_coalesce(lines, intermediate_size, separator="\n")
+         lines = reverse_merge(lines, n=cleanup_size, separator="\n")

-         # for paragraphs larger than target_size
-         # split to sentences and coalesce
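+         # sentence-split any line still longer than target_size; keep short lines as-is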
        chunks = [
            cls.constrained_split(
-                 line, target_size, coalesce_range=sentence_range, separator=" "
+                 line, target_size, min_size=cleanup_size, separator=" "
            )
            if len(line) > target_size
            else [line]
            for line in lines
        ]

-         # flatten list of lists
        chunks = cls.flatten(chunks)
-         return list(filter(lambda x: True if x.strip() else False, chunks))
+         return list(filter(lambda x: bool(x.strip()), chunks))

    @classmethod
    def reconstruct(
@@ -85,46 +101,51 @@ def reconstruct(
        poly_order: int,
        savgol_window: int,
        max_score_pct: float = 0.4,
-     ) -> List[str]:
+         return_minima: bool = False,
+     ) -> Union[List[str], Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """
        Reconstruct text chunks based on semantic similarity.

-         Parameters:
-         - lines (List[str]): List of text chunks to reconstruct.
-         - norm_embed (np.ndarray): Embeddings (normalized).
-         - target_size (int): Target size for final chunks.
-         - window_size (int): Window size for similarity matrix averaging.
-         - poly_order (int): Polynomial order for Savitzky-Golay filter.
-         - savgol_window (int): Window size for Savitzky-Golay filter.
+         Args:
+             lines (List[str]): List of text chunks to reconstruct.
+             norm_embed (np.ndarray): Normalized embeddings of the text chunks.
+             target_size (int): Target size for final chunks.
+             window_size (int): Window size for similarity matrix averaging.
+             poly_order (int): Polynomial order for Savitzky-Golay filter.
+             savgol_window (int): Window size for Savitzky-Golay filter.
+             max_score_pct (float, optional): Maximum percentile of similarity scores to consider. Defaults to 0.4.
+             return_minima (bool, optional): If True, return minima information instead of reconstructed text. Defaults to False.

        Returns:
-         - List[str]: List of semantically split text chunks.
+             Union[List[str], Tuple[np.ndarray, np.ndarray, np.ndarray]]:
+                 If return_minima is False, returns a list of reconstructed text chunks.
+                 If return_minima is True, returns a tuple of (roots, y, sim_avg).
+
+         Raises:
+             AssertionError: If the number of texts doesn't equal the number of embeddings.
        """
        assert (
            len(lines) == norm_embed.shape[0]
        ), "Number of texts must equal number of embeddings"

-         # calculate the similarity for the window
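+         # average cross-similarity over a sliding window and locate local minima on a Savitzky-Golay smoothed curve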
        sim_avg = windowed_cross_similarity(norm_embed, window_size)
-
-         # find the minima
        roots, y = find_local_minima(
            sim_avg, poly_order=poly_order, window_size=savgol_window
        )
-         split_points = np.round(roots).astype(int).tolist()

-         # filter to minima within bottom Nth percentile of similarity scores
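+         # optionally return the raw minima locations, their fitted values, and the windowed similarity signal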
+         if return_minima:
+             return roots, y, sim_avg
+
        (x_idx,) = np.where(y < np.quantile(sim_avg, max_score_pct))
-         split_points = [x for i, x in enumerate(split_points) if i in x_idx]
+         split_points = [int(x) for i, x in enumerate(roots.tolist()) if i in x_idx]

-         # reconstruct using the minima as boundaries for coalesce
-         # this ensures that any semantic boundaries are respected
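+         # coalesce lines within each region bounded by the detected minima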
        chunks = []
        start = 0
        for end in split_points + [len(lines)]:
            chunk = constrained_coalesce(lines[start:end], target_size)
            chunks.extend(chunk)
            start = end

-         chunks = constrained_coalesce(chunks, target_size)
-         return chunks
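+         # pack lines into batches no larger than target_size and join each batch into a chunk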
+         return list(
+             map("".join, constrained_batches(lines, max_size=target_size, strict=False))
+         )