From 1230a163fda2efb7dd3937e043ec50d325f228f0 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 1 Feb 2023 14:32:16 -0500 Subject: [PATCH] feat: set a user controlled max word length for titles (#189) * update the docs * add option for title max word length * bump version; update changelog * change max length to 12 * docs updates * to -> too --- CHANGELOG.md | 7 +++++-- docs/source/bricks.rst | 12 ++++++------ test_unstructured/partition/test_text_type.py | 12 +++++++++++- unstructured/__version__.py | 2 +- unstructured/partition/text_type.py | 18 ++++++++++++++++-- 5 files changed, 39 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 198283039c..94a8ec70e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.4.5-dev2 +## 0.4.5-dev3 * Loosen the default cap threshold to `0.5`. -* Add a `NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling the cap ratio threshold. +* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling + the cap ratio threshold. * Unknown text elements are identified as `Text` for HTML and plain text documents. * `Body Text` styles no longer default to `NarrativeText` for Word documents. The style information is insufficient to determine that the text is narrative. @@ -9,6 +10,8 @@ * Adds an `Address` element for capturing elements that only contain an address. * Suppress the `UserWarning` when detectron is called. * Checks that titles and narrative test have at least one English word. +* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH` + environment variable for controlling the max number of words in a title. ## 0.4.4 diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 6727e22b1b..849cfce01d 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -248,7 +248,7 @@ for consideration as narrative text. 
The function performs the following checks * Text that exceeds the specified caps ratio cannot be narrative text. The threshold is configurable with the ``cap_threshold`` kwarg. To ignore this check, you can set ``cap_threshold=1.0``. You can also set the threshold by using the - ``NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable + ``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable takes precedence over the kwarg. * The cap ratio test does not apply to text that is all uppercase. @@ -279,7 +279,10 @@ The ``is_possible_title`` function determines if a section of text is a candidat for consideration as a title. The function performs the following checks: * Empty text cannot be a title -* Text that is all numeric cannot be a title +* Text that is all numeric cannot be a title. +* If a title contains too many words it is not a title. The default max length is ``12``. You can change the max length with + the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment + variable takes precedence over the kwarg. * Narrative text must contain at least one English word (if ``language`` is set to "en") * If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5. * If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles. @@ -379,10 +382,7 @@ Examples: Determines if the section of text exceeds the specified caps ratio. Used in ``is_possible_narrative_text`` and ``is_possible_title``, but can be used independently as well. You can set the caps threshold using the ``threshold`` kwarg. The threshold -defaults to ``0.3``. Only runs on sections of text that are a single sentence. 
-You can also set the threshold using the ``NARRATIVE_TEXT_CAP_THRESHOLD`` environment -variable. The environment variable takes precedence over the kwarg. The caps ratio -check does not apply to text that is all capitalized. +defaults to ``0.3``. Only runs on sections of text that are a single sentence. The caps ratio check does not apply to text that is all capitalized. Examples: diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py index 693e86a42a..6503365185 100644 --- a/test_unstructured/partition/test_text_type.py +++ b/test_unstructured/partition/test_text_type.py @@ -62,6 +62,7 @@ def test_is_possible_narrative_text(text, expected, monkeypatch): ("ITEM 1A. RISK FACTORS", True), # Two "sentences", but both are short ("To My Dearest Friends,", False), # Ends with a comma ("BTAR ADFJA L", False), # Doesn't have english words + ("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long ], ) def test_is_possible_title(text, expected, monkeypatch): @@ -168,7 +169,7 @@ def test_contains_exceeds_cap_ratio(text, expected, monkeypatch): def test_set_caps_ratio_with_environment_variable(monkeypatch): monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize) monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize) - monkeypatch.setenv("NARRATIVE_TEXT_CAP_THRESHOLD", 0.8) + monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", 0.8) text = "All The King's Horses. And All The King's Men." 
with patch.object(text_type, "exceeds_cap_ratio", return_value=False) as mock_exceeds: @@ -177,6 +178,15 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch): mock_exceeds.assert_called_once_with(text, threshold=0.8) +def test_set_title_max_word_length_with_environment_variable(monkeypatch): + monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize) + monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize) + monkeypatch.setenv("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", "5") + + text = "Intellectual Property in the United States" + assert text_type.is_possible_title(text) is False  # six words exceed the limit of 5 + + def test_sentence_count(monkeypatch): monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize) text = "Hi my name is Matt. I work with Crag." diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3732ca24f7..3d7d3cefed 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.5-dev2" # pragma: no cover +__version__ = "0.4.5-dev3" # pragma: no cover diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index c5bae1303b..8ed326f9d5 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -47,7 +47,9 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language: # NOTE(robinson): it gets read in from the environment as a string so we need to # cast it to a float - cap_threshold = float(os.environ.get("NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold)) + cap_threshold = float( + os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold) + ) if exceeds_cap_ratio(text, threshold=cap_threshold): logger.debug(f"Not narrative. 
Text exceeds cap ratio {cap_threshold}:\n\n{text}") return False @@ -59,7 +61,9 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language: return True -def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "en") -> bool: +def is_possible_title( + text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en" +) -> bool: """Checks to see if the text passes all of the checks for a valid title. Parameters @@ -68,6 +72,8 @@ def is_possible_title(text: str, sentence_min_length: int = 5, language: str = " the input text sentence_min_length the minimum number of words required to consider a section of text a sentence + title_max_word_length + the maximum number of words a title can contain language the two letter language code for the text. defaults to "en" for English """ @@ -75,6 +81,14 @@ def is_possible_title(text: str, sentence_min_length: int = 5, language: str = " logger.debug("Not a title. Text is empty.") return False + title_max_word_length = int( + os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length) + ) + # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it + # is less expensive and actual tokenization doesn't add much value for the length check + if len(text.split(" ")) > title_max_word_length: + return False + # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles if text.endswith(","): return False