From 1230a163fda2efb7dd3937e043ec50d325f228f0 Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructured.io>
Date: Wed, 1 Feb 2023 14:32:16 -0500
Subject: [PATCH] feat: set a user controlled max word length for titles (#189)

* update the docs

* add option for title max word length

* bump version; update changelog

* change max length to 12

* docs updates

* to -> too
---
 CHANGELOG.md                                  |  7 +++++--
 docs/source/bricks.rst                        | 12 ++++++------
 test_unstructured/partition/test_text_type.py | 12 +++++++++++-
 unstructured/__version__.py                   |  2 +-
 unstructured/partition/text_type.py           | 18 ++++++++++++++++--
 5 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 198283039c..94a8ec70e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.4.5-dev2
+## 0.4.5-dev3
 
 * Loosen the default cap threshold to `0.5`.
-* Add a `NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling the cap ratio threshold.
+* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
+  the cap ratio threshold.
 * Unknown text elements are identified as `Text` for HTML and plain text documents.
 * `Body Text` styles no longer default to `NarrativeText` for Word documents. The style information
   is insufficient to determine that the text is narrative.
@@ -9,6 +10,8 @@
 * Adds an `Address` element for capturing elements that only contain an address.
 * Suppress the `UserWarning` when detectron is called.
 * Checks that titles and narrative test have at least one English word.
+* Restricts titles to a maximum number of words. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
+  environment variable for controlling the max number of words in a title.
 
 ## 0.4.4
 
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index 6727e22b1b..849cfce01d 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -248,7 +248,7 @@ for consideration as narrative text. The function performs the following checks
 * Text that exceeds the specified caps ratio cannot be narrative text. The threshold
   is configurable with the ``cap_threshold`` kwarg. To ignore this check, you can set
   ``cap_threshold=1.0``. You can also set the threshold by using the
-  ``NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
+  ``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
   takes precedence over the kwarg.
 * The cap ratio test does not apply to text that is all uppercase.
 
@@ -279,7 +279,10 @@ The ``is_possible_title`` function determines if a section of text is a candidat
 for consideration as a title. The function performs the following checks:
 
 * Empty text cannot be a title
-* Text that is all numeric cannot be a title
+* Text that is all numeric cannot be a title.
+* Text that contains too many words cannot be a title. The default max length is ``12`` words. You can change the max length with
+  the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
+  variable takes precedence over the kwarg.
 * Narrative text must contain at least one English word (if ``language`` is set to "en")
 * If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
 * If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
@@ -379,10 +382,7 @@ Examples:
 Determines if the section of text exceeds the specified caps ratio. Used in
 ``is_possible_narrative_text`` and ``is_possible_title``, but can be used independently
 as well. You can set the caps threshold using the ``threshold`` kwarg. The threshold
-defaults to ``0.3``. Only runs on sections of text that are a single sentence.
-You can also set the threshold using the ``NARRATIVE_TEXT_CAP_THRESHOLD`` environment
-variable. The environment variable takes precedence over the kwarg. The caps ratio
-check does not apply to text that is all capitalized.
+defaults to ``0.3``. Only runs on sections of text that are a single sentence. The caps ratio check does not apply to text that is all capitalized.
 
 Examples:
 
diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py
index 693e86a42a..6503365185 100644
--- a/test_unstructured/partition/test_text_type.py
+++ b/test_unstructured/partition/test_text_type.py
@@ -62,6 +62,7 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
         ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
         ("To My Dearest Friends,", False),  # Ends with a comma
         ("BTAR ADFJA L", False),  # Doesn't have english words
+        ("ITEM 1A. RISK FACTORS " * 15, False),  # Title is too long
     ],
 )
 def test_is_possible_title(text, expected, monkeypatch):
@@ -168,7 +169,7 @@ def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
 def test_set_caps_ratio_with_environment_variable(monkeypatch):
     monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
     monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
-    monkeypatch.setenv("NARRATIVE_TEXT_CAP_THRESHOLD", 0.8)
+    monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", 0.8)
 
     text = "All The King's Horses. And All The King's Men."
     with patch.object(text_type, "exceeds_cap_ratio", return_value=False) as mock_exceeds:
@@ -177,6 +178,15 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch):
     mock_exceeds.assert_called_once_with(text, threshold=0.8)
 
 
+def test_set_title_max_word_length_with_environment_variable(monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    monkeypatch.setenv("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", "5")  # env values must be str
+
+    text = "Intellectual Property in the United States"  # six words: over the limit of 5
+    assert text_type.is_possible_title(text) is False  # exercise the title check itself
+
+
 def test_sentence_count(monkeypatch):
     monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
     text = "Hi my name is Matt. I work with Crag."
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 3732ca24f7..3d7d3cefed 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.5-dev2"  # pragma: no cover
+__version__ = "0.4.5-dev3"  # pragma: no cover
diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py
index c5bae1303b..8ed326f9d5 100644
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@@ -47,7 +47,9 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
 
     # NOTE(robinson): it gets read in from the environment as a string so we need to
     # cast it to a float
-    cap_threshold = float(os.environ.get("NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold))
+    cap_threshold = float(
+        os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold)
+    )
     if exceeds_cap_ratio(text, threshold=cap_threshold):
         logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
         return False
@@ -59,7 +61,9 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
     return True
 
 
-def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "en") -> bool:
+def is_possible_title(
+    text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en"
+) -> bool:
     """Checks to see if the text passes all of the checks for a valid title.
 
     Parameters
@@ -68,6 +72,8 @@ def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "
         the input text
     sentence_min_length
         the minimum number of words required to consider a section of text a sentence
+    title_max_word_length
+        the maximum number of words a title can contain
     language
         the two letter language code for the text. defaults to "en" for English
     """
@@ -75,6 +81,14 @@ def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "
         logger.debug("Not a title. Text is empty.")
         return False
 
+    title_max_word_length = int(
+        os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length)
+    )
+    # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
+    # is less expensive and actual tokenization doesn't add much value for the length check
+    if len(text.split(" ")) > title_max_word_length:
+        return False
+
     # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
     if text.endswith(","):
         return False