Skip to content

Commit

Permalink
rfctr(chunking): generalize PreChunkBuilder (#2283)
Browse files Browse the repository at this point in the history
To implement inter-pre-chunk overlap, we need a context that sees every
pre-chunk both before and after it is accumulated (from elements).

- We need access to the pre-chunk when it is completed so we can extract
the "tail" overlap to be applied to the next chunk.
- We need access to the as-yet-unpopulated pre-chunk so we can add the
prior tail to it as a prefix.

This "visibility" is split between `PreChunkBuilder` and the pre-chunker
itself, which handles `TablePreChunk`s without the builder.

Move `Table` element and `TablePreChunk` formation into `PreChunkBuilder`
such that _all_ element types (adding `Table` elements in particular)
pass through it. Then `PreChunkBuilder` becomes the context we require.

The actual overlap harvesting and application will come in a subsequent
commit.
  • Loading branch information
scanny authored Dec 18, 2023
1 parent 9efc22c commit 0c7f64e
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 48 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.11.6-dev0

### Enhancements

### Features

### Fixes

## 0.11.5

### Enhancements
Expand Down
106 changes: 91 additions & 15 deletions test_unstructured/chunking/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)
from unstructured.documents.elements import (
CompositeElement,
Element,
ElementMetadata,
PageBreak,
RegexMetadata,
Expand Down Expand Up @@ -572,24 +573,83 @@ class DescribePreChunkBuilder:
def it_is_empty_on_construction(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

assert builder.text_length == 0
assert builder.remaining_space == 50
assert builder._text_length == 0
assert builder._remaining_space == 50

def it_accumulates_elements_added_to_it(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

builder.add_element(Title("Introduction"))
assert builder.text_length == 12
assert builder.remaining_space == 136
assert builder._text_length == 12
assert builder._remaining_space == 136

builder.add_element(
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
"lectus porta volutpat.",
),
)
assert builder.text_length == 112
assert builder.remaining_space == 36
assert builder._text_length == 112
assert builder._remaining_space == 36

# -- Two first-element cases: a `Table`, and a 1000-character `Text` ("abcd " repeated 200 times).
# -- The options are built with no arguments, so whatever default limits `ChunkingOptions.new()`
# -- applies are the ones in effect (the test name treats the long text as oversized for them).
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
# -- nothing has been added yet, so the builder is empty when `.will_fit()` is asked --
assert builder.will_fit(element)

# -- Counterpart to the empty-builder case: each pair is (element already in the builder,
# -- candidate element). The candidates are the same kinds used for the empty case, a `Table`
# -- and a 1000-character `Text`; the existing element is either a short `Text` or a `Table`.
@pytest.mark.parametrize(
("existing_element", "next_element"),
[
(Text("abcd"), Table("Fruits\nMango")),
(Text("abcd"), Text("abcd " * 200)),
(Table("Heading\nCell text"), Table("Fruits\nMango")),
(Table("Heading\nCell text"), Text("abcd " * 200)),
],
)
def but_not_when_it_already_contains_an_element_of_any_kind(
self, existing_element: Element, next_element: Element
):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
# -- make the builder non-empty before asking whether the candidate fits --
builder.add_element(existing_element)

assert not builder.will_fit(next_element)

# -- Candidates are a short `Text` and a `Table`; the builder holds a `Table` in both cases, so
# -- the rejection cannot be attributed to the candidate being a `Table` or being long.
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
# -- the `Table` is the first (and only) element in the builder --
builder.add_element(Table("Heading\nCell text"))

assert not builder.will_fit(element)

def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
# -- hard-max (`max_characters`) is 100 and soft-max (`new_after_n_chars`) is 50 --
builder = PreChunkBuilder(
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
)
# -- 55 characters of accumulated text puts the builder above the soft-max of 50 --
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)

# -- the candidate is only 17 characters, so 55 + 2 (separator) + 17 == 74 stays well under
# -- the hard-max of 100; the rejection therefore has to come from the soft-max rule --
assert not builder.will_fit(Text("In rhoncus ipsum."))

def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
# -- only `max_characters` is given here; NOTE(review): this relies on the resulting soft-max
# -- not rejecting a 55-character builder on its own -- confirm against `ChunkingOptions` --
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)

# -- boundary case: one character more than the hard-max of 100 allows --
# -- 55 + 2 (separator) + 44 == 101 --
assert not builder.will_fit(
Text("In rhoncus ipsum sed lectus portos volutpat.") # 44-chars
)

def but_it_will_fit_an_element_that_fits(self):
# -- same starting state as the hard-max rejection case: hard-max 100, 55 characters present --
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)

# -- boundary case: the combined length lands exactly on the hard-max, which is accepted --
# -- 55 + 2 (separator) + 43 == 100 --
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars

def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
Expand All @@ -611,30 +671,46 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
"lectus porta volutpat.",
),
]
assert builder.text_length == 0
assert builder.remaining_space == 150
assert builder._text_length == 0
assert builder._remaining_space == 150

def but_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder.add_element(Table("Heading\nCell text"))

# -- `.flush()` returns an iterator; take only its first item rather than exhausting it --
pre_chunk = next(builder.flush())

# -- pre-chunk builder was reset before the yield, such that the iterator does not need to
# -- be exhausted before clearing out the old elements and a new pre-chunk can be
# -- accumulated immediately (first `next()` call is required however, to advance to the
# -- yield statement).
assert builder._text_length == 0
assert builder._remaining_space == 150
# -- pre-chunk is a `TablePreChunk` --
assert isinstance(pre_chunk, TablePreChunk)
# -- and it wraps a table equal to the one that was added to the builder --
assert pre_chunk._table == Table("Heading\nCell text")

def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

pre_chunks = list(builder.flush())

assert pre_chunks == []
assert builder.text_length == 0
assert builder.remaining_space == 150
assert builder._text_length == 0
assert builder._remaining_space == 150

def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
builder.add_element(Text("abcde"))
builder.add_element(Text("fghij"))

# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
assert builder.text_length == 12
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
# -- between the current text and that of the next element if one was added.
assert builder._text_length == 12
# -- ._remaining_space is reduced by the length (2) of the trailing separator which would
# -- go between the current text and that of the next element if one was added.
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
assert builder.remaining_space == 36
assert builder._remaining_space == 36


class DescribePreChunkCombiner:
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.5" # pragma: no cover
__version__ = "0.11.6-dev0" # pragma: no cover
73 changes: 59 additions & 14 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,8 +396,8 @@ def _text(self) -> str:
class PreChunkBuilder:
"""An element accumulator suitable for incrementally forming a pre-chunk.
Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
to determine whether it should add the next element in the element stream.
Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
the next element in the element stream.
`.flush()` is used to build a PreChunk object from the accumulated elements. This method
returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
Expand Down Expand Up @@ -426,7 +426,7 @@ def add_element(self, element: Element) -> None:
self._text_segments.append(element.text)
self._text_len += len(element.text)

def flush(self) -> Iterator[TextPreChunk]:
def flush(self) -> Iterator[PreChunk]:
"""Generate zero-or-one `PreChunk` object and clear the accumulator.
Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
Expand All @@ -435,23 +435,62 @@ def flush(self) -> Iterator[TextPreChunk]:
"""
if not self._elements:
return

pre_chunk = (
TablePreChunk(self._elements[0], self._opts)
if isinstance(self._elements[0], Table)
# -- copy list, don't use original or it may change contents as builder proceeds --
else TextPreChunk(list(self._elements), self._opts)
)
# -- clear builder before yield so we're not sensitive to the timing of how/when this
# -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
elements = self._elements[:]
self._elements.clear()
self._text_segments.clear()
self._text_len = 0
yield TextPreChunk(elements, self._opts)
# -- iterator is exhausted and can add elements for the next pre-chunk immediately.
self._reset_state()
yield pre_chunk

def will_fit(self, element: Element) -> bool:
    """True when `element` can be added to this pre-chunk without violating its limits.

    The rules are applied in this order and the first one that matches decides the result:

    - An empty pre-chunk accepts any element, including a `Table` and including a text
      element longer than the hard-max.
    - A `Table` element will never fit with any other element. It will only fit in an empty
      pre-chunk.
    - No element will fit in a pre-chunk that already contains a `Table` element.
    - A text-element will not fit in a pre-chunk that already exceeds the soft-max
      (aka. new_after_n_chars).
    - A text-element will not fit when together with the elements already present it would
      exceed the hard-max (aka. max_characters).

    This method only answers the question; it does not add `element` or change the builder.
    """
    # -- an empty pre-chunk will accept any element (including an oversized-element) --
    if not self._elements:
        return True
    # -- a `Table` will not fit in a non-empty pre-chunk --
    if isinstance(element, Table):
        return False
    # -- no element will fit in a pre-chunk that already contains a `Table` element. The
    # -- pre-chunk is known to be non-empty at this point so indexing `[0]` is safe. Only the
    # -- first element is inspected, as a `Table` is only ever accepted into an empty
    # -- pre-chunk by the rule above.
    if isinstance(self._elements[0], Table):
        return False
    # -- a pre-chunk that already exceeds the soft-max is considered "full" --
    if self._text_length > self._opts.soft_max:
        return False
    # -- don't add an element if it would increase total size beyond the hard-max --
    return len(element.text) <= self._remaining_space

@property
def remaining_space(self) -> int:
def _remaining_space(self) -> int:
"""Maximum text-length of an element that can be added without exceeding maxlen."""
# -- include length of trailing separator that will go before next element text --
separators_len = self._separator_len * len(self._text_segments)
return self._opts.hard_max - self._text_len - separators_len

def _reset_state(self) -> None:
    """Return the accumulator to its "empty" condition so the next pre-chunk can be started."""
    self._text_len = 0
    # -- both lists are emptied in place rather than rebound to new list objects --
    for accumulated in (self._text_segments, self._elements):
        accumulated.clear()

@property
def text_length(self) -> int:
def _text_length(self) -> int:
"""Length of the text in this pre-chunk.
This value represents the chunk-size that would result if this pre-chunk was flushed in its
Expand Down Expand Up @@ -502,10 +541,16 @@ def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:


class TextPreChunkAccumulator:
"""Accumulates, measures, and combines pre-chunk objects.
"""Accumulates, measures, and combines text pre-chunks.
Used for combining pre-chunks for chunking strategies like "by-title" that can potentially
produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only
sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with
another pre-chunk.
Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
whether to add another pre-chunk.
Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides
monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add
another pre-chunk.
`.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
Expand Down
24 changes: 6 additions & 18 deletions unstructured/chunking/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,8 @@
PreChunk,
PreChunkBuilder,
PreChunkCombiner,
TablePreChunk,
)
from unstructured.documents.elements import (
Element,
Table,
Title,
)
from unstructured.documents.elements import Element, Title


def chunk_by_title(
Expand Down Expand Up @@ -109,24 +104,17 @@ def _split_elements_by_title_and_table(

# -- start new pre_chunk when necessary --
if (
# -- Title and Table both start a new pre_chunk --
isinstance(element, (Title, Table))
# -- adding this element would exceed hard-maxlen for pre_chunk --
or pre_chunk_builder.remaining_space < len(str(element))
# -- pre_chunk already meets or exceeds soft-maxlen --
or pre_chunk_builder.text_length >= opts.soft_max
# -- Title starts a new "section" and so a new pre_chunk --
isinstance(element, Title)
# -- start a new pre-chunk when the WIP pre-chunk is already full --
or not pre_chunk_builder.will_fit(element)
# -- a semantic boundary is indicated by metadata change since prior element --
or metadata_differs
):
# -- complete any work-in-progress pre_chunk --
yield from pre_chunk_builder.flush()

# -- emit table and checkbox immediately since they are always isolated --
if isinstance(element, Table):
yield TablePreChunk(table=element, opts=opts)
# -- but accumulate text elements for consolidation into a composite chunk --
else:
pre_chunk_builder.add_element(element)
pre_chunk_builder.add_element(element)

prior_element = element

Expand Down

0 comments on commit 0c7f64e

Please sign in to comment.