Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
lcs_bound_config: LCSBoundConfig | None = None,
disable_word_level_longest_common_subsequence: bool = False,
disable_char_level_longest_common_subsequence: bool = True,
remove_consecutive_whitespace: bool = False,
) -> None:
columns = generation_df.columns.tolist()
assert prompt_key in columns, (
Expand Down Expand Up @@ -78,6 +79,8 @@ def __init__(
disable_char_level_longest_common_subsequence
)

self.remove_consecutive_whitespace = remove_consecutive_whitespace

super().__init__(df_train_user=generation_df, df_test_user=pd.DataFrame())

@property
Expand Down
54 changes: 40 additions & 14 deletions privacy_guard/analysis/extraction/text_inclusion_analysis_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,20 @@ def _clean_text(text: str) -> str:
return cleaned_text


def _clean_text_remove_consecutive_whitespace(text: str) -> str:
    """Normalize *text* and collapse runs of whitespace into single spaces.

    Builds on the base ``_clean_text`` normalization (lowercasing,
    punctuation removal, newline/tab-to-space conversion) and then rejoins
    the whitespace-separated tokens with single spaces, which both strips
    leading/trailing whitespace and removes consecutive whitespace.
    """
    base_normalized = _clean_text(text=text)
    tokens = base_normalized.split()
    return " ".join(tokens)


def _word_level_longest_common_subsequence_helper(
s1: str, s2: str, autojunk: bool = True
) -> int:
Expand Down Expand Up @@ -245,9 +259,13 @@ def _char_level_longest_common_substring_helper(s1: str, s2: str) -> int:
return max_length


def _normalize_by_target_len(
    scores: pd.Series,
    targets: pd.Series,
    clean_text_method: Callable[[str], str] = _clean_text,
) -> pd.Series:
    """Normalize similarity scores by the cleaned length of each target.

    Args:
        scores: Raw similarity scores, aligned index-wise with ``targets``.
        targets: Target strings whose cleaned lengths become denominators.
        clean_text_method: Normalization callable applied to each target
            before measuring its length. Defaults to ``_clean_text`` so the
            denominator matches the cleaning used when scores were computed;
            pass ``_clean_text_remove_consecutive_whitespace`` when that
            cleaning was used instead.

    Returns:
        ``scores`` divided element-wise by the cleaned target lengths.

    NOTE(review): ``progress_apply`` assumes tqdm's pandas integration was
    registered by the caller — confirm. A target that cleans to the empty
    string produces a zero length (inf/NaN score under pandas division);
    confirm downstream consumers tolerate this.
    """
    lengths = targets.progress_apply(lambda x: len(clean_text_method(x)))
    return scores / lengths


Expand Down Expand Up @@ -296,6 +314,12 @@ def __init__(self, analysis_input: TextInclusionAnalysisInput) -> None:
self.target_set_key
].apply(lambda x: len(x))

self.clean_text_method = (
_clean_text
if not analysis_input.remove_consecutive_whitespace
else _clean_text_remove_consecutive_whitespace
)

super().__init__(analysis_input=analysis_input)

def _compute_word_level_longest_common_subsequence_helper(
Expand All @@ -307,8 +331,8 @@ def _compute_word_level_longest_common_subsequence_helper(
Returns:
int: Number of shared words between the two strings.
"""
s1 = _clean_text(row[s1_column or self.target_key])
s2 = _clean_text(row[s2_column or self.generation_key])
s1 = self.clean_text_method(row[s1_column or self.target_key])
s2 = self.clean_text_method(row[s2_column or self.generation_key])
return _word_level_longest_common_subsequence_helper(s1, s2)

def _compute_char_level_longest_common_subsequence_helper(
Expand All @@ -320,8 +344,8 @@ def _compute_char_level_longest_common_subsequence_helper(
Returns:
int: Number of shared words between the two strings.
"""
s1 = _clean_text(row[s1_column or self.target_key])
s2 = _clean_text(row[s2_column or self.generation_key])
s1 = self.clean_text_method(row[s1_column or self.target_key])
s2 = self.clean_text_method(row[s2_column or self.generation_key])
return _char_level_longest_common_subsequence_helper(s1, s2)

def _compute_edit_similarity(
Expand All @@ -336,8 +360,8 @@ def _compute_edit_similarity(
Returns:
int: Edit similarity between the two strings.
"""
s1 = _clean_text(row[s1_column or self.target_key])
s2 = _clean_text(row[s2_column or self.generation_key])
s1 = self.clean_text_method(row[s1_column or self.target_key])
s2 = self.clean_text_method(row[s2_column or self.generation_key])
levenshtein = textdistance.levenshtein.similarity(s1, s2)
return levenshtein

Expand All @@ -363,8 +387,8 @@ def _compute_inclusion_score(self, row: pd.Series) -> bool:
Returns:
bool: True if the target is included in the output_text, False otherwise.
"""
s1 = _clean_text(row[self.target_key])
s2 = _clean_text(row[self.generation_key])
s1 = self.clean_text_method(row[self.target_key])
s2 = self.clean_text_method(row[self.generation_key])
return s1 in s2

def get_compute_longest_common_substring_map(
Expand Down Expand Up @@ -415,11 +439,11 @@ def _compute_longest_common_substring_map(

target_set = row[self.target_set_key]

comparison_text = _clean_text(row[comparison_key])
fp_text = _clean_text(row[false_positive_key])
comparison_text = self.clean_text_method(row[comparison_key])
fp_text = self.clean_text_method(row[false_positive_key])

for target in target_set:
clean_target = _clean_text(target)
clean_target = self.clean_text_method(target)

if lcs_bound_config is not None:
lcs = _char_level_longest_common_substring_helper_bound(
Expand Down Expand Up @@ -527,7 +551,9 @@ def run_analysis(self) -> TextInclusionAnalysisNodeOutput:
self._compute_edit_similarity, axis=1
)
generation_df["edit_similarity_score"] = _normalize_by_target_len(
generation_df["edit_similarity"], generation_df["target"]
generation_df["edit_similarity"],
generation_df["target"],
self.clean_text_method,
)

outputs.edit_similarity = generation_df["edit_similarity"]
Expand Down
86 changes: 86 additions & 0 deletions privacy_guard/analysis/tests/test_text_inclusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
_char_level_longest_common_substring_helper_bound,
_char_level_longest_common_substring_with_matched_text,
_clean_text,
_clean_text_remove_consecutive_whitespace,
_normalize_by_target_len,
_word_level_longest_common_subsequence_helper,
TextInclusionAnalysisNode,
TextInclusionAnalysisNodeOutput,
Expand Down Expand Up @@ -522,3 +524,87 @@ def test_longest_common_susequence_match_autojunk(self) -> None:
_word_level_longest_common_subsequence_helper(s1=s1, s2=s2, autojunk=True),
0,
)

def test_clean_text_remove_consecutive_whitespace(self) -> None:
    # NOTE: the inputs below intentionally contain runs of whitespace —
    # without them the "collapse consecutive whitespace" cases are vacuous.

    # Test that consecutive whitespace is properly removed
    text_with_consecutive_spaces = "hello  world"
    result = _clean_text_remove_consecutive_whitespace(text_with_consecutive_spaces)
    self.assertEqual(result, "hello world")

    # Test multiple consecutive spaces
    text_with_many_spaces = "hello    world     test"
    result = _clean_text_remove_consecutive_whitespace(text_with_many_spaces)
    self.assertEqual(result, "hello world test")

    # Test tabs and newlines are also collapsed
    text_with_mixed_whitespace = "hello\t\t\nworld"
    result = _clean_text_remove_consecutive_whitespace(text_with_mixed_whitespace)
    self.assertEqual(result, "hello world")

    # Test leading/trailing whitespace is stripped
    text_with_leading_trailing = "   hello world   "
    result = _clean_text_remove_consecutive_whitespace(text_with_leading_trailing)
    self.assertEqual(result, "hello world")

def test_normalize_by_target_len_respects_clean_method(self) -> None:
    # Test that _normalize_by_target_len uses the provided clean_text_method.
    # Targets contain double spaces so the two cleaning methods disagree on
    # length ("hello  world" is 12 chars raw, 11 once collapsed).
    targets = pd.Series(["hello  world", "test  text"])  # consecutive spaces
    scores = pd.Series([10.0, 8.0])

    # With _clean_text, consecutive spaces are preserved
    # "hello  world" -> "hello  world" (12 chars after cleaning)
    result_basic = _normalize_by_target_len(scores, targets, _clean_text)
    # len("hello  world") = 12
    self.assertAlmostEqual(result_basic.iloc[0], 10.0 / 12.0)

    # With _clean_text_remove_consecutive_whitespace, consecutive spaces are removed
    # "hello  world" -> "hello world" (11 chars after cleaning)
    result_no_consecutive = _normalize_by_target_len(
        scores, targets, _clean_text_remove_consecutive_whitespace
    )
    # len("hello world") = 11
    self.assertAlmostEqual(result_no_consecutive.iloc[0], 10.0 / 11.0)

    # Verify the two methods produce different results
    self.assertNotAlmostEqual(result_basic.iloc[0], result_no_consecutive.iloc[0])

def test_analysis_with_remove_consecutive_whitespace(self) -> None:
    # Test that TextInclusionAnalysisNode respects the
    # remove_consecutive_whitespace config end-to-end. The target carries a
    # double space so its cleaned length differs between the two configs,
    # which should change the normalized edit_similarity_score.
    data = {
        "prompt": ["test  prompt"],  # consecutive spaces
        "target": ["target  text"],  # consecutive spaces
        "output_text": ["target text"],  # no consecutive spaces
    }

    # Without remove_consecutive_whitespace
    analysis_input_basic = TextInclusionAnalysisInput(
        generation_df=pd.DataFrame(data),
        remove_consecutive_whitespace=False,
        disable_longest_common_substring=True,
        disable_word_level_longest_common_subsequence=True,
        disable_char_level_longest_common_subsequence=True,
    )
    analysis_node_basic = TextInclusionAnalysisNode(
        analysis_input=analysis_input_basic
    )
    results_basic = analysis_node_basic.compute_outputs()

    # With remove_consecutive_whitespace
    analysis_input_cleaned = TextInclusionAnalysisInput(
        generation_df=pd.DataFrame(data),
        remove_consecutive_whitespace=True,
        disable_longest_common_substring=True,
        disable_word_level_longest_common_subsequence=True,
        disable_char_level_longest_common_subsequence=True,
    )
    analysis_node_cleaned = TextInclusionAnalysisNode(
        analysis_input=analysis_input_cleaned
    )
    results_cleaned = analysis_node_cleaned.compute_outputs()

    # edit_similarity_score should be different because target length differs
    # when consecutive whitespace is removed vs preserved
    self.assertNotAlmostEqual(
        results_basic["edit_similarity_score"].iloc[0],
        results_cleaned["edit_similarity_score"].iloc[0],
    )
Loading