Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
lcs_bound_config: LCSBoundConfig | None = None,
disable_word_level_longest_common_subsequence: bool = False,
disable_char_level_longest_common_subsequence: bool = True,
remove_consecutive_whitespace: bool = False,
) -> None:
columns = generation_df.columns.tolist()
assert prompt_key in columns, (
Expand Down Expand Up @@ -78,6 +79,8 @@ def __init__(
disable_char_level_longest_common_subsequence
)

self.remove_consecutive_whitespace = remove_consecutive_whitespace

super().__init__(df_train_user=generation_df, df_test_user=pd.DataFrame())

@property
Expand Down
54 changes: 40 additions & 14 deletions privacy_guard/analysis/extraction/text_inclusion_analysis_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,20 @@ def _clean_text(text: str) -> str:
return cleaned_text


def _clean_text_remove_consecutive_whitespace(text: str) -> str:
    """Normalize *text* and collapse runs of whitespace into single spaces.

    Builds on the base ``_clean_text`` normalization (lowercasing,
    punctuation removal, newline/tab-to-space conversion) and then rejoins
    the whitespace-separated tokens with single spaces, which both strips
    leading/trailing whitespace and removes consecutive whitespace.
    """
    base_normalized = _clean_text(text=text)
    tokens = base_normalized.split()
    return " ".join(tokens)


def _word_level_longest_common_subsequence_helper(
s1: str, s2: str, autojunk: bool = True
) -> int:
Expand Down Expand Up @@ -245,9 +259,13 @@ def _char_level_longest_common_substring_helper(s1: str, s2: str) -> int:
return max_length


def _normalize_by_target_len(
    scores: pd.Series,
    targets: pd.Series,
    clean_text_method: Callable[[str], str] = _clean_text,
) -> pd.Series:
    """Normalize similarity scores by the cleaned length of each target.

    Args:
        scores: Raw similarity scores, aligned index-wise with ``targets``.
        targets: Target strings whose cleaned lengths become denominators.
        clean_text_method: Normalization callable applied to each target
            before measuring its length. Defaults to ``_clean_text`` so the
            denominator matches the cleaning used when scores were computed;
            pass ``_clean_text_remove_consecutive_whitespace`` when that
            cleaning was used instead.

    Returns:
        ``scores`` divided element-wise by the cleaned target lengths.

    NOTE(review): ``progress_apply`` assumes tqdm's pandas integration was
    registered by the caller — confirm. A target that cleans to the empty
    string produces a zero length (inf/NaN score under pandas division);
    confirm downstream consumers tolerate this.
    """
    lengths = targets.progress_apply(lambda x: len(clean_text_method(x)))
    return scores / lengths


Expand Down Expand Up @@ -296,6 +314,12 @@ def __init__(self, analysis_input: TextInclusionAnalysisInput) -> None:
self.target_set_key
].apply(lambda x: len(x))

self.clean_text_method = (
_clean_text
if not analysis_input.remove_consecutive_whitespace
else _clean_text_remove_consecutive_whitespace
)

super().__init__(analysis_input=analysis_input)

def _compute_word_level_longest_common_subsequence_helper(
Expand All @@ -307,8 +331,8 @@ def _compute_word_level_longest_common_subsequence_helper(
Returns:
int: Number of shared words between the two strings.
"""
s1 = _clean_text(row[s1_column or self.target_key])
s2 = _clean_text(row[s2_column or self.generation_key])
s1 = self.clean_text_method(row[s1_column or self.target_key])
s2 = self.clean_text_method(row[s2_column or self.generation_key])
return _word_level_longest_common_subsequence_helper(s1, s2)

def _compute_char_level_longest_common_subsequence_helper(
Expand All @@ -320,8 +344,8 @@ def _compute_char_level_longest_common_subsequence_helper(
Returns:
int: Number of shared words between the two strings.
"""
s1 = _clean_text(row[s1_column or self.target_key])
s2 = _clean_text(row[s2_column or self.generation_key])
s1 = self.clean_text_method(row[s1_column or self.target_key])
s2 = self.clean_text_method(row[s2_column or self.generation_key])
return _char_level_longest_common_subsequence_helper(s1, s2)

def _compute_edit_similarity(
Expand All @@ -336,8 +360,8 @@ def _compute_edit_similarity(
Returns:
int: Edit similarity between the two strings.
"""
s1 = _clean_text(row[s1_column or self.target_key])
s2 = _clean_text(row[s2_column or self.generation_key])
s1 = self.clean_text_method(row[s1_column or self.target_key])
s2 = self.clean_text_method(row[s2_column or self.generation_key])
levenshtein = textdistance.levenshtein.similarity(s1, s2)
return levenshtein

Expand All @@ -363,8 +387,8 @@ def _compute_inclusion_score(self, row: pd.Series) -> bool:
Returns:
bool: True if the target is included in the output_text, False otherwise.
"""
s1 = _clean_text(row[self.target_key])
s2 = _clean_text(row[self.generation_key])
s1 = self.clean_text_method(row[self.target_key])
s2 = self.clean_text_method(row[self.generation_key])
return s1 in s2

def get_compute_longest_common_substring_map(
Expand Down Expand Up @@ -415,11 +439,11 @@ def _compute_longest_common_substring_map(

target_set = row[self.target_set_key]

comparison_text = _clean_text(row[comparison_key])
fp_text = _clean_text(row[false_positive_key])
comparison_text = self.clean_text_method(row[comparison_key])
fp_text = self.clean_text_method(row[false_positive_key])

for target in target_set:
clean_target = _clean_text(target)
clean_target = self.clean_text_method(target)

if lcs_bound_config is not None:
lcs = _char_level_longest_common_substring_helper_bound(
Expand Down Expand Up @@ -527,7 +551,9 @@ def run_analysis(self) -> TextInclusionAnalysisNodeOutput:
self._compute_edit_similarity, axis=1
)
generation_df["edit_similarity_score"] = _normalize_by_target_len(
generation_df["edit_similarity"], generation_df["target"]
generation_df["edit_similarity"],
generation_df["target"],
self.clean_text_method,
)

outputs.edit_similarity = generation_df["edit_similarity"]
Expand Down
86 changes: 86 additions & 0 deletions privacy_guard/analysis/tests/test_text_inclusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
_char_level_longest_common_substring_helper_bound,
_char_level_longest_common_substring_with_matched_text,
_clean_text,
_clean_text_remove_consecutive_whitespace,
_normalize_by_target_len,
_word_level_longest_common_subsequence_helper,
TextInclusionAnalysisNode,
TextInclusionAnalysisNodeOutput,
Expand Down Expand Up @@ -522,3 +524,87 @@ def test_longest_common_susequence_match_autojunk(self) -> None:
_word_level_longest_common_subsequence_helper(s1=s1, s2=s2, autojunk=True),
0,
)

def test_clean_text_remove_consecutive_whitespace(self) -> None:
    # NOTE: the inputs below intentionally contain runs of whitespace —
    # without them the "collapse consecutive whitespace" cases are vacuous.

    # Test that consecutive whitespace is properly removed
    text_with_consecutive_spaces = "hello  world"
    result = _clean_text_remove_consecutive_whitespace(text_with_consecutive_spaces)
    self.assertEqual(result, "hello world")

    # Test multiple consecutive spaces
    text_with_many_spaces = "hello    world     test"
    result = _clean_text_remove_consecutive_whitespace(text_with_many_spaces)
    self.assertEqual(result, "hello world test")

    # Test tabs and newlines are also collapsed
    text_with_mixed_whitespace = "hello\t\t\nworld"
    result = _clean_text_remove_consecutive_whitespace(text_with_mixed_whitespace)
    self.assertEqual(result, "hello world")

    # Test leading/trailing whitespace is stripped
    text_with_leading_trailing = "   hello world   "
    result = _clean_text_remove_consecutive_whitespace(text_with_leading_trailing)
    self.assertEqual(result, "hello world")

def test_normalize_by_target_len_respects_clean_method(self) -> None:
    # Test that _normalize_by_target_len uses the provided clean_text_method.
    # Targets contain double spaces so the two cleaning methods disagree on
    # length ("hello  world" is 12 chars raw, 11 once collapsed).
    targets = pd.Series(["hello  world", "test  text"])  # consecutive spaces
    scores = pd.Series([10.0, 8.0])

    # With _clean_text, consecutive spaces are preserved
    # "hello  world" -> "hello  world" (12 chars after cleaning)
    result_basic = _normalize_by_target_len(scores, targets, _clean_text)
    # len("hello  world") = 12
    self.assertAlmostEqual(result_basic.iloc[0], 10.0 / 12.0)

    # With _clean_text_remove_consecutive_whitespace, consecutive spaces are removed
    # "hello  world" -> "hello world" (11 chars after cleaning)
    result_no_consecutive = _normalize_by_target_len(
        scores, targets, _clean_text_remove_consecutive_whitespace
    )
    # len("hello world") = 11
    self.assertAlmostEqual(result_no_consecutive.iloc[0], 10.0 / 11.0)

    # Verify the two methods produce different results
    self.assertNotAlmostEqual(result_basic.iloc[0], result_no_consecutive.iloc[0])

def test_analysis_with_remove_consecutive_whitespace(self) -> None:
    # Test that TextInclusionAnalysisNode respects the
    # remove_consecutive_whitespace config end-to-end. The target carries a
    # double space so its cleaned length differs between the two configs,
    # which should change the normalized edit_similarity_score.
    data = {
        "prompt": ["test  prompt"],  # consecutive spaces
        "target": ["target  text"],  # consecutive spaces
        "output_text": ["target text"],  # no consecutive spaces
    }

    # Without remove_consecutive_whitespace
    analysis_input_basic = TextInclusionAnalysisInput(
        generation_df=pd.DataFrame(data),
        remove_consecutive_whitespace=False,
        disable_longest_common_substring=True,
        disable_word_level_longest_common_subsequence=True,
        disable_char_level_longest_common_subsequence=True,
    )
    analysis_node_basic = TextInclusionAnalysisNode(
        analysis_input=analysis_input_basic
    )
    results_basic = analysis_node_basic.compute_outputs()

    # With remove_consecutive_whitespace
    analysis_input_cleaned = TextInclusionAnalysisInput(
        generation_df=pd.DataFrame(data),
        remove_consecutive_whitespace=True,
        disable_longest_common_substring=True,
        disable_word_level_longest_common_subsequence=True,
        disable_char_level_longest_common_subsequence=True,
    )
    analysis_node_cleaned = TextInclusionAnalysisNode(
        analysis_input=analysis_input_cleaned
    )
    results_cleaned = analysis_node_cleaned.compute_outputs()

    # edit_similarity_score should be different because target length differs
    # when consecutive whitespace is removed vs preserved
    self.assertNotAlmostEqual(
        results_basic["edit_similarity_score"].iloc[0],
        results_cleaned["edit_similarity_score"].iloc[0],
    )
Loading