Commit 40fd426

splitter class and inference implementation

1 parent: c84c36f

File tree: 7 files changed, +245 −223 lines

Binary files changed (not rendered): −258 KB, 258 KB, 748 Bytes, 2.87 KB

tutorials/semantic_split/semantic_split.md (+100 −223)

Large diffs are not rendered by default.
wordllama/algorithms/semantic_splitter.py (+102, new file)

import numpy as np
from typing import List
from itertools import chain
from .find_local_minima import window_average, find_local_minima
from .splitter import constrained_coalesce, split_sentences


class SemanticSplitter:
    """A class for semantically splitting and reconstructing text."""

    @staticmethod
    def flatten(nested_list: List[List]) -> List:
        """Flatten a list of lists into a single list."""
        return list(chain.from_iterable(nested_list))

    @staticmethod
    def constrained_split(text: str, target_size: int) -> List[str]:
        """
        Split text into chunks of approximately target_size.

        Parameters:
        - text (str): The text to split.
        - target_size (int): The target size for each chunk.

        Returns:
        - List[str]: List of text chunks.
        """
        sentences = split_sentences(text)
        return constrained_coalesce(sentences, target_size, separator=" ")

    @classmethod
    def split(cls, text: str, target_size: int, initial_split_size: int) -> List[str]:
        """
        Split the input text into chunks.

        Parameters:
        - text (str): The input text to split.
        - target_size (int): The target size for final chunks.
        - initial_split_size (int): The initial size for splitting on newlines.

        Returns:
        - List[str]: List of text chunks.
        """
        lines = constrained_coalesce(
            text.splitlines(), initial_split_size, separator="\n"
        )
        chunks = [
            cls.constrained_split(line, target_size)
            if len(line) > target_size
            else [line]
            for line in lines
        ]
        chunks = cls.flatten(chunks)
        return [chunk for chunk in chunks if chunk.strip()]

    @classmethod
    def reconstruct(
        cls,
        lines: List[str],
        x_sim: np.ndarray,
        target_size: int,
        window_size: int,
        poly_order: int,
        savgol_window: int,
        max_score_pct: float = 0.4,
    ) -> List[str]:
        """
        Reconstruct text chunks based on semantic similarity.

        Parameters:
        - lines (List[str]): List of text chunks to reconstruct.
        - x_sim (np.ndarray): Cross-similarity matrix of text chunks.
        - target_size (int): Target size for final chunks.
        - window_size (int): Window size for similarity matrix averaging.
        - poly_order (int): Polynomial order for Savitzky-Golay filter.
        - savgol_window (int): Window size for Savitzky-Golay filter.
        - max_score_pct (float): Quantile below which similarity minima
          are kept as split points.

        Returns:
        - List[str]: List of semantically split text chunks.
        """
        sim_avg = window_average(x_sim, window_size)
        x = np.arange(len(sim_avg))
        roots, y = find_local_minima(
            x, sim_avg, poly_order=poly_order, window_size=savgol_window
        )
        split_points = np.round(roots).astype(int).tolist()

        # keep only minima below the max_score_pct quantile of similarity
        # scores (bottom 40% by default)
        (x_idx,) = np.where(y < np.quantile(sim_avg, max_score_pct))
        split_points = [x for i, x in enumerate(split_points) if i in x_idx]

        # reconstruct using the minima as boundaries for coalesce;
        # this ensures that any semantic boundaries are respected
        chunks = []
        start = 0
        for end in split_points + [len(lines)]:
            chunk = constrained_coalesce(lines[start:end], target_size)
            chunks.extend(chunk)
            start = end

        chunks = constrained_coalesce(chunks, target_size)
        return chunks
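
The split_sentences and constrained_coalesce helpers are imported from the .splitter module, which is not part of this commit. A minimal sketch of the assumed behavior, not the repository's actual implementation: split_sentences breaks text at sentence boundaries, and constrained_coalesce greedily merges adjacent segments until the target character count is reached.

import re
from typing import List

# Hypothetical sketches of the .splitter helpers; the real module is not
# shown in this diff, so the behavior here is an assumption.

def split_sentences(text: str) -> List[str]:
    # naive split at terminal punctuation followed by whitespace
    return [s for s in re.split(r"(?<=[.!?])\s+", text) if s]

def constrained_coalesce(segments: List[str], target_size: int, separator: str = " ") -> List[str]:
    # greedily merge adjacent segments while the joined string stays
    # at or below target_size characters
    merged: List[str] = []
    buffer = ""
    for seg in segments:
        candidate = f"{buffer}{separator}{seg}" if buffer else seg
        if len(candidate) <= target_size:
            buffer = candidate
        else:
            if buffer:
                merged.append(buffer)
            buffer = seg
    if buffer:
        merged.append(buffer)
    return merged

For example, constrained_coalesce(["one", "two", "three"], 10) would return ["one two", "three"]: the first two segments fit within 10 characters when joined, the third does not.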

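window_average and find_local_minima come from the .find_local_minima module, also outside this diff. Judging from the call sites in reconstruct, window_average collapses the cross-similarity matrix into a 1-D signal of local neighborhood similarity, and find_local_minima smooths that signal with a Savitzky-Golay filter and returns the positions and values of its local minima. A hedged sketch under those assumptions:

import numpy as np
from scipy.signal import savgol_filter, argrelextrema

# Hypothetical sketches; signatures are inferred from how
# SemanticSplitter.reconstruct calls these functions.

def window_average(x_sim: np.ndarray, window_size: int) -> np.ndarray:
    # mean similarity of each chunk to its neighbors within a band of
    # +/- window_size around the diagonal, excluding self-similarity
    n = x_sim.shape[0]
    out = np.empty(n)
    for i in range(n):
        lo, hi = max(0, i - window_size), min(n, i + window_size + 1)
        out[i] = np.delete(x_sim[i, lo:hi], i - lo).mean()
    return out

def find_local_minima(x, y, poly_order=2, window_size=7):
    # smooth the signal, then return the positions and smoothed values
    # at indices where a local minimum occurs
    smoothed = savgol_filter(y, window_length=window_size, polyorder=poly_order)
    (idx,) = argrelextrema(smoothed, np.less)
    return x[idx].astype(float), smoothed[idx]

In reconstruct, the minima positions are rounded to integer indices and kept only where the smoothed value falls below the max_score_pct quantile (0.4 by default), so only comparatively strong dips in similarity become split points.
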
wordllama/inference.py (+43)

@@ -9,6 +9,7 @@
     binarize_and_packbits,
     process_batches_cy,
 )
+from .algorithms.semantic_splitter import SemanticSplitter
 from .config import WordLlamaConfig

 # Set up logging
@@ -370,3 +371,45 @@ def cluster(
             random_state=random_state,
         )
         return cluster_labels, inertia
+
+    def split(
+        self,
+        text: str,
+        target_size: int = 1536,
+        window_size: int = 3,
+        initial_split_size: int = 64,
+        poly_order: int = 2,
+        savgol_window: int = 7,
+    ) -> List[str]:
+        """
+        Perform semantic splitting on the input text.
+
+        Parameters:
+        - text (str): The input text to split.
+        - target_size (int): Target size for text chunks.
+        - window_size (int): Window size for similarity matrix averaging.
+        - initial_split_size (int): Initial size for splitting on newlines.
+        - poly_order (int): Polynomial order for Savitzky-Golay filter.
+        - savgol_window (int): Window size for Savitzky-Golay filter.
+
+        Returns:
+        - List[str]: List of semantically split text chunks.
+        """
+        # split text
+        lines = SemanticSplitter.split(
+            text, target_size=target_size, initial_split_size=initial_split_size
+        )
+
+        # compute cross similarity
+        embeddings = self.embed(lines)
+        cross_similarity = self.vector_similarity(embeddings, embeddings)
+
+        # reconstruct text with similarity signals
+        return SemanticSplitter.reconstruct(
+            lines,
+            cross_similarity,
+            target_size=target_size,
+            window_size=window_size,
+            poly_order=poly_order,
+            savgol_window=savgol_window,
+        )
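
End to end, the new split method chunks the text, embeds the chunks, scores neighboring similarity, and re-coalesces at the similarity minima. A usage sketch, assuming the package's WordLlama.load() entry point and a plain-text file of your own:

from wordllama import WordLlama

wl = WordLlama.load()  # load the default embedding model

with open("article.txt") as f:  # any long plain-text document
    text = f.read()

chunks = wl.split(text, target_size=1536)
for i, chunk in enumerate(chunks):
    print(f"chunk {i}: {len(chunk)} chars")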
