Skip to content

Commit 1285ab3

Browse files
authored
Merge pull request #65 from DataFog/feat/regex-fallback-improvements
feat(regex): Enhance regex patterns and tests for PII detection
2 parents 648fae5 + efcd6b6 commit 1285ab3

File tree

2 files changed

+302
-270
lines changed

2 files changed

+302
-270
lines changed

datafog/processing/text_processing/regex_annotator/regex_annotator.py

Lines changed: 96 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -34,45 +34,123 @@ def __init__(self):
3434
# Compile all patterns once at initialization
3535
self.patterns: Dict[str, Pattern] = {
3636
# Email pattern - RFC 5322 subset
37+
# Intentionally permissive to favor false positives over false negatives
3738
# Allows for multiple dots, special characters in local part, and subdomains
38-
# The pattern is intentionally permissive to favor false positives over false negatives
39+
# Note: This is broader than the spec to catch more potential emails
3940
"EMAIL": re.compile(
40-
r"[\w!#$%&\'*+\-/=?^_`{|}~.]+@[\w\-.]+\.[\w\-.]+",
41-
re.IGNORECASE | re.MULTILINE,
41+
r"""
42+
[\w!#$%&'*+\-/=?^_`{|}~.]+ # Local part with special chars allowed
43+
@ # @ symbol
44+
[\w\-.]+ # Domain name with possible dots
45+
\.[\w\-.]+ # TLD with at least one dot
46+
""",
47+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
4248
),
43-
# Phone pattern - NANP (North American Numbering Plan) format
44-
# Accepts formats like: 555-555-5555, (555) 555-5555, +1 555 555 5555, 1-555-555-5555
49+
# Phone pattern - North American Numbering Plan (NANP) format
50+
# Accepts formats: 555-555-5555, (555) 555-5555, +1 555 555 5555, 1-555-555-5555
51+
# Note: Allows for various separators (dash, dot, space) and optional country code
4552
"PHONE": re.compile(
46-
r"(?:(?:\+|)1[-\.\s]?)?\(?\d{3}\)?[-\.\s]?\d{3}[-\.\s]?\d{4}",
47-
re.IGNORECASE | re.MULTILINE,
53+
r"""
54+
(?:(?:\+|)1[-\.\s]?)? # Optional country code (+1 or 1)
55+
\(?\d{3}\)? # Area code, optionally in parentheses
56+
[-\.\s]? # Optional separator after area code
57+
\d{3} # Exchange code
58+
[-\.\s]? # Optional separator after exchange code
59+
\d{4} # Subscriber number
60+
""",
61+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
4862
),
4963
# SSN pattern - U.S. Social Security Number
5064
# Format: XXX-XX-XXXX where XXX != 000, 666
65+
# Note: Uses negative lookahead to reject invalid prefixes
5166
"SSN": re.compile(
52-
r"\b(?!000|666)\d{3}-\d{2}-\d{4}\b", re.IGNORECASE | re.MULTILINE
67+
r"""
68+
\b # Word boundary
69+
(?!000|666) # Reject 000 and 666 prefixes
70+
\d{3} # First 3 digits
71+
- # Hyphen separator
72+
\d{2} # Middle 2 digits
73+
- # Hyphen separator
74+
\d{4} # Last 4 digits
75+
\b # Word boundary
76+
""",
77+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
5378
),
5479
# Credit card pattern - Visa, Mastercard, and American Express
5580
# Visa: 13 or 16 digits when unseparated (16 when written with separators), starts with 4
5681
# Mastercard: 16 digits, starts with 51-55
57-
# American Express: 15 digits, starts with 34 or 37
82+
# American Express: 15 digits, starts with 34 or 37 (EXACTLY 15 digits)
83+
# Note: Handles both continuous digit formats and formats with separators
5884
"CREDIT_CARD": re.compile(
59-
r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|(?:(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})|(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}))\b",
60-
re.IGNORECASE | re.MULTILINE,
85+
r"""
86+
\b
87+
(?:
88+
4\d{12}(?:\d{3})? # Visa (13 or 16 digits, starts with 4)
89+
|
90+
5[1-5]\d{14} # Mastercard (16 digits, starts with 51-55)
91+
|
92+
3[47]\d{13}$ # Amex (15 digits, starts with 34 or 37); the $ anchor limits this branch to end of line
93+
|
94+
(?: # Formatted versions with separators
95+
(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2}) # Card prefix
96+
[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4} # Rest of card with separators
97+
)
98+
|
99+
(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}) # Amex with separators
100+
)
101+
\b
102+
""",
103+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
61104
),
62105
# IP Address pattern - IPv4 and IPv6
63106
# IPv4: 4 octets of numbers 0-255 separated by dots
64107
# IPv6: 8 groups of 1-4 hex digits separated by colons, with possible compression
108+
# Note: Validates IPv4 octets to be in valid range (0-255)
65109
"IP_ADDRESS": re.compile(
66-
r"(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b|\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\b|\b(?:[A-Fa-f0-9]{1,4}:){0,6}(?::[A-Fa-f0-9]{1,4}){1,6}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,7}:\b)",
67-
re.IGNORECASE | re.MULTILINE,
110+
r"""
111+
(?:
112+
# IPv4 address pattern
113+
\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b
114+
|
115+
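# Permissive IPv6 pattern: every group is optional, so it also matches non-addresses (e.g. bare hex tokens); the leading \b keeps addresses that start with "::" from matching in full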
116+
\b(?:[0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\b
117+
)
118+
""",
119+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
68120
),
69121
# Date of Birth pattern - supports MM/DD/YYYY, M/D/YYYY, MM-DD-YYYY, and YYYY-MM-DD formats
70-
# Validates that month is 01-12 and day is 01-31 for MM/DD/YYYY format
122+
# Note: Validates that month is 01-12 and day is 01-31
71123
"DOB": re.compile(
72-
r"\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:\d{2}|\d{4})|(?:\d{4})-(?:0?[1-9]|1[0-2])-(?:0?[1-9]|[12][0-9]|3[01]))\b",
73-
re.IGNORECASE | re.MULTILINE,
124+
r"""
125+
\b
126+
(?:
127+
(?:0?[1-9]|1[0-2]) # Month: 1-12
128+
[/-] # Separator (/ or -)
129+
(?:0?[1-9]|[12][0-9]|3[01]) # Day: 1-31
130+
[/-] # Separator (/ or -)
131+
(?:\d{2}|\d{4}) # Year: 2 or 4 digits
132+
|
133+
(?:\d{4}) # Year: 4 digits (ISO format)
134+
- # Separator (-)
135+
(?:0?[1-9]|1[0-2]) # Month: 1-12
136+
- # Separator (-)
137+
(?:0?[1-9]|[12][0-9]|3[01]) # Day: 1-31
138+
)
139+
\b
140+
""",
141+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
142+
),
143+
# ZIP code pattern - US ZIP / ZIP+4
144+
# Note: Supports both 5-digit ZIP and ZIP+4 format
145+
"ZIP": re.compile(
146+
r"""
147+
\b
148+
\d{5} # 5-digit ZIP code
149+
(?:-\d{4})? # Optional +4 extension
150+
\b
151+
""",
152+
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
74153
),
75-
"ZIP": re.compile(r"\b\d{5}(?:-\d{4})?\b", re.IGNORECASE | re.MULTILINE),
76154
}
77155

78156
@classmethod
@@ -129,7 +207,7 @@ def annotate_with_spans(
129207
end=match.end(),
130208
text=match.group(),
131209
)
132-
spans_by_label.setdefault(label, []).append(span)
210+
spans_by_label[label].append(span)
133211
all_spans.append(span)
134212

135213
regex_result = {

0 commit comments

Comments
 (0)