rfctr(chunking): generalize PreChunkBuilder (#2283)

To implement inter-pre-chunk overlap, we need a context that sees every pre-chunk both before and after it is accumulated (from elements):

- We need access to the pre-chunk when it is completed so we can extract the "tail" overlap to be applied to the next chunk.
- We need access to the as-yet-unpopulated pre-chunk so we can add the prior tail to it as a prefix.

This "visibility" is currently split between `PreChunkBuilder` and the pre-chunker itself, which handles `TablePreChunk`s without the builder.

Move `Table` element handling and `TablePreChunk` formation into `PreChunkBuilder` such that _all_ element types (adding `Table` elements in particular) pass through it. Then `PreChunkBuilder` becomes the context we require.

The actual overlap harvesting and application will come in a subsequent commit.
Unstructured-IO · Dec 18, 2023 · 0c7f64e · 0c7f64e
1 parent 9efc22c
commit 0c7f64e
Show file tree

Hide file tree

Showing 5 changed files with 165 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 0.11.6-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.11.5
 
 ### Enhancements

diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
@@ -18,6 +18,7 @@
 )
 from unstructured.documents.elements import (
     CompositeElement,
+    Element,
     ElementMetadata,
     PageBreak,
     RegexMetadata,
@@ -572,24 +573,83 @@ class DescribePreChunkBuilder:
     def it_is_empty_on_construction(self):
         builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
 
-        assert builder.text_length == 0
-        assert builder.remaining_space == 50
+        assert builder._text_length == 0
+        assert builder._remaining_space == 50
 
     def it_accumulates_elements_added_to_it(self):
         builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
 
         builder.add_element(Title("Introduction"))
-        assert builder.text_length == 12
-        assert builder.remaining_space == 136
+        assert builder._text_length == 12
+        assert builder._remaining_space == 136
 
         builder.add_element(
             Text(
                 "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                 "lectus porta volutpat.",
             ),
         )
-        assert builder.text_length == 112
-        assert builder.remaining_space == 36
+        assert builder._text_length == 112
+        assert builder._remaining_space == 36
+
+    @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
+    def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        assert builder.will_fit(element)
+
+    @pytest.mark.parametrize(
+        ("existing_element", "next_element"),
+        [
+            (Text("abcd"), Table("Fruits\nMango")),
+            (Text("abcd"), Text("abcd " * 200)),
+            (Table("Heading\nCell text"), Table("Fruits\nMango")),
+            (Table("Heading\nCell text"), Text("abcd " * 200)),
+        ],
+    )
+    def but_not_when_it_already_contains_an_element_of_any_kind(
+        self, existing_element: Element, next_element: Element
+    ):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder.add_element(existing_element)
+
+        assert not builder.will_fit(next_element)
+
+    @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
+    def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder.add_element(Table("Heading\nCell text"))
+
+        assert not builder.will_fit(element)
+
+    def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
+        builder = PreChunkBuilder(
+            opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
+        )
+        builder.add_element(
+            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
+        )
+
+        assert not builder.will_fit(Text("In rhoncus ipsum."))
+
+    def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
+        builder.add_element(
+            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
+        )
+
+        # -- 55 + 2 (separator) + 44 == 101 --
+        assert not builder.will_fit(
+            Text("In rhoncus ipsum sed lectus portos volutpat.")  # 44-chars
+        )
+
+    def but_it_will_fit_an_element_that_fits(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
+        builder.add_element(
+            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
+        )
+
+        # -- 55 + 2 (separator) + 43 == 100 --
+        assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat."))  # 43-chars
 
     def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
         builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
@@ -611,30 +671,46 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
                 "lectus porta volutpat.",
             ),
         ]
-        assert builder.text_length == 0
-        assert builder.remaining_space == 150
+        assert builder._text_length == 0
+        assert builder._remaining_space == 150
+
+    def but_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+        builder.add_element(Table("Heading\nCell text"))
+
+        pre_chunk = next(builder.flush())
+
+        # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
+        # -- be exhausted before clearing out the old elements and a new pre-chunk can be
+        # -- accumulated immediately (first `next()` call is required however, to advance to the
+        # -- yield statement).
+        assert builder._text_length == 0
+        assert builder._remaining_space == 150
+        # -- pre-chunk is a `TablePreChunk` --
+        assert isinstance(pre_chunk, TablePreChunk)
+        assert pre_chunk._table == Table("Heading\nCell text")
 
     def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
         builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
 
         pre_chunks = list(builder.flush())
 
         assert pre_chunks == []
-        assert builder.text_length == 0
-        assert builder.remaining_space == 150
+        assert builder._text_length == 0
+        assert builder._remaining_space == 150
 
     def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
         builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
         builder.add_element(Text("abcde"))
         builder.add_element(Text("fghij"))
 
-        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
+        # -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
         # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
-        assert builder.text_length == 12
-        # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
-        # -- between the current text and that of the next element if one was added.
+        assert builder._text_length == 12
+        # -- ._remaining_space is reduced by the length (2) of the trailing separator which would
+        # -- go between the current text and that of the next element if one was added.
         # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
-        assert builder.remaining_space == 36
+        assert builder._remaining_space == 36
 
 
 class DescribePreChunkCombiner:

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.5"  # pragma: no cover
+__version__ = "0.11.6-dev0"  # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
@@ -396,8 +396,8 @@ def _text(self) -> str:
 class PreChunkBuilder:
     """An element accumulator suitable for incrementally forming a pre-chunk.
 
-    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
-    to determine whether it should add the next element in the element stream.
+    Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
+    the next element in the element stream.
 
     `.flush()` is used to build a PreChunk object from the accumulated elements. This method
     returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
@@ -426,7 +426,7 @@ def add_element(self, element: Element) -> None:
             self._text_segments.append(element.text)
             self._text_len += len(element.text)
 
-    def flush(self) -> Iterator[TextPreChunk]:
+    def flush(self) -> Iterator[PreChunk]:
         """Generate zero-or-one `PreChunk` object and clear the accumulator.
 
         Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
@@ -435,23 +435,62 @@ def flush(self) -> Iterator[TextPreChunk]:
         """
         if not self._elements:
             return
+
+        pre_chunk = (
+            TablePreChunk(self._elements[0], self._opts)
+            if isinstance(self._elements[0], Table)
+            # -- copy list, don't use original or it may change contents as builder proceeds --
+            else TextPreChunk(list(self._elements), self._opts)
+        )
         # -- clear builder before yield so we're not sensitive to the timing of how/when this
-        # -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
-        elements = self._elements[:]
-        self._elements.clear()
-        self._text_segments.clear()
-        self._text_len = 0
-        yield TextPreChunk(elements, self._opts)
+        # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
+        self._reset_state()
+        yield pre_chunk
+
+    def will_fit(self, element: Element) -> bool:
+        """True when `element` can be added to this prechunk without violating its limits.
+
+        There are several limits:
+        - A `Table` element will never fit with any other element. It will only fit in an empty
+          pre-chunk.
+        - No element will fit in a pre-chunk that already contains a `Table` element.
+        - A text-element will not fit in a pre-chunk that already exceeds the soft-max
+          (aka. new_after_n_chars).
+        - A text-element will not fit when together with the elements already present it would
+          exceed the hard-max (aka. max_characters).
+        """
+        # -- an empty pre-chunk will accept any element (including an oversized-element) --
+        if len(self._elements) == 0:
+            return True
+        # -- a `Table` will not fit in a non-empty pre-chunk --
+        if isinstance(element, Table):
+            return False
+        # -- no element will fit in a pre-chunk that already contains a `Table` element --
+        if self._elements and isinstance(self._elements[0], Table):
+            return False
+        # -- a pre-chunk that already exceeds the soft-max is considered "full" --
+        if self._text_length > self._opts.soft_max:
+            return False
+        # -- don't add an element if it would increase total size beyond the hard-max --
+        if self._remaining_space < len(element.text):
+            return False
+        return True
 
     @property
-    def remaining_space(self) -> int:
+    def _remaining_space(self) -> int:
         """Maximum text-length of an element that can be added without exceeding maxlen."""
         # -- include length of trailing separator that will go before next element text --
         separators_len = self._separator_len * len(self._text_segments)
         return self._opts.hard_max - self._text_len - separators_len
 
+    def _reset_state(self) -> None:
+        """Set working-state values back to "empty", ready to accumulate next pre-chunk."""
+        self._elements.clear()
+        self._text_segments.clear()
+        self._text_len = 0
+
     @property
-    def text_length(self) -> int:
+    def _text_length(self) -> int:
         """Length of the text in this pre-chunk.
 
         This value represents the chunk-size that would result if this pre-chunk was flushed in its
@@ -502,10 +541,16 @@ def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
 
 
 class TextPreChunkAccumulator:
-    """Accumulates, measures, and combines pre-chunk objects.
+    """Accumulates, measures, and combines text pre-chunks.
+
+    Used for combining pre-chunks for chunking strategies like "by-title" that can potentially
+    produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only
+    sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with
+    another pre-chunk.
 
-    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
-    whether to add another pre-chunk.
+    Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides
+    monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add
+    another pre-chunk.
 
     `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
     This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used

diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
@@ -12,13 +12,8 @@
     PreChunk,
     PreChunkBuilder,
     PreChunkCombiner,
-    TablePreChunk,
-)
-from unstructured.documents.elements import (
-    Element,
-    Table,
-    Title,
 )
+from unstructured.documents.elements import Element, Title
 
 
 def chunk_by_title(
@@ -109,24 +104,17 @@ def _split_elements_by_title_and_table(
 
         # -- start new pre_chunk when necessary --
         if (
-            # -- Title and Table both start a new pre_chunk --
-            isinstance(element, (Title, Table))
-            # -- adding this element would exceed hard-maxlen for pre_chunk --
-            or pre_chunk_builder.remaining_space < len(str(element))
-            # -- pre_chunk already meets or exceeds soft-maxlen --
-            or pre_chunk_builder.text_length >= opts.soft_max
+            # -- Title starts a new "section" and so a new pre_chunk --
+            isinstance(element, Title)
+            # -- start a new pre-chunk when the WIP pre-chunk is already full --
+            or not pre_chunk_builder.will_fit(element)
             # -- a semantic boundary is indicated by metadata change since prior element --
             or metadata_differs
         ):
             # -- complete any work-in-progress pre_chunk --
             yield from pre_chunk_builder.flush()
 
-        # -- emit table and checkbox immediately since they are always isolated --
-        if isinstance(element, Table):
-            yield TablePreChunk(table=element, opts=opts)
-        # -- but accumulate text elements for consolidation into a composite chunk --
-        else:
-            pre_chunk_builder.add_element(element)
+        pre_chunk_builder.add_element(element)
 
         prior_element = element