@@ -34,45 +34,123 @@ def __init__(self):
34
34
# Compile all patterns once at initialization
35
35
self .patterns : Dict [str , Pattern ] = {
36
36
# Email pattern - loosely modeled on RFC 5322 (not a strict subset; also accepts some invalid addresses)
37
+ # Intentionally permissive to favor false positives over false negatives
37
38
# Allows for multiple dots, special characters in local part, and subdomains
38
- # The pattern is intentionally permissive to favor false positives over false negatives
39
+ # Note: This is broader than the spec to catch more potential emails
39
40
"EMAIL" : re .compile (
40
- r"[\w!#$%&\'*+\-/=?^_`{|}~.]+@[\w\-.]+\.[\w\-.]+" ,
41
- re .IGNORECASE | re .MULTILINE ,
41
+ r"""
42
+ [\w!#$%&'*+\-/=?^_`{|}~.]+ # Local part with special chars allowed
43
+ @ # @ symbol
44
+ [\w\-.]+ # Domain name with possible dots
45
+ \.[\w\-.]+ # TLD with at least one dot
46
+ """ ,
47
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
42
48
),
43
- # Phone pattern - NANP (North American Numbering Plan) format
44
- # Accepts formats like: 555-555-5555, (555) 555-5555, +1 555 555 5555, 1-555-555-5555
49
+ # Phone pattern - North American Numbering Plan (NANP) format
50
+ # Accepts formats: 555-555-5555, (555) 555-5555, +1 555 555 5555, 1-555-555-5555
51
+ # Note: Allows for various separators (dash, dot, space) and optional country code
45
52
"PHONE" : re .compile (
46
- r"(?:(?:\+|)1[-\.\s]?)?\(?\d{3}\)?[-\.\s]?\d{3}[-\.\s]?\d{4}" ,
47
- re .IGNORECASE | re .MULTILINE ,
53
+ r"""
54
+ (?:(?:\+|)1[-\.\s]?)? # Optional country code (+1 or 1)
55
+ \(?\d{3}\)? # Area code, optionally in parentheses
56
+ [-\.\s]? # Optional separator after area code
57
+ \d{3} # Exchange code
58
+ [-\.\s]? # Optional separator after exchange code
59
+ \d{4} # Subscriber number
60
+ """ ,
61
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
48
62
),
49
63
# SSN pattern - U.S. Social Security Number
50
64
# Format: XXX-XX-XXXX where XXX != 000, 666
65
+ # Note: Uses negative lookahead to reject invalid prefixes
51
66
"SSN" : re .compile (
52
- r"\b(?!000|666)\d{3}-\d{2}-\d{4}\b" , re .IGNORECASE | re .MULTILINE
67
+ r"""
68
+ \b # Word boundary
69
+ (?!000|666) # Reject 000 and 666 prefixes
70
+ \d{3} # First 3 digits
71
+ - # Hyphen separator
72
+ \d{2} # Middle 2 digits
73
+ - # Hyphen separator
74
+ \d{4} # Last 4 digits
75
+ \b # Word boundary
76
+ """ ,
77
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
53
78
),
54
79
# Credit card pattern - Visa, Mastercard, and American Express
55
80
# Visa: 13 or 16 digits, starts with 4
56
81
# Mastercard: 16 digits, starts with 51-55
57
- # American Express: 15 digits, starts with 34 or 37
82
+ # American Express: 15 digits, starts with 34 or 37 (EXACTLY 15 digits)
83
+ # Note: Handles both continuous digit formats and formats with separators
58
84
"CREDIT_CARD" : re .compile (
59
- r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|(?:(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})|(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}))\b" ,
60
- re .IGNORECASE | re .MULTILINE ,
85
+ r"""
86
+ \b
87
+ (?:
88
+ 4\d{12}(?:\d{3})? # Visa (16 digits, starts with 4)
89
+ |
90
+ 5[1-5]\d{14} # Mastercard (16 digits, starts with 51-55)
91
+ |
92
+ 3[47]\d{13}$ # Amex (EXACTLY 15 digits, starts with 34 or 37)
93
+ |
94
+ (?: # Formatted versions with separators
95
+ (?:4\d{3}|5[1-5]\d{2}|3[47]\d{2}) # Card prefix
96
+ [-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4} # Rest of card with separators
97
+ )
98
+ |
99
+ (?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}) # Amex with separators
100
+ )
101
+ \b
102
+ """ ,
103
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
61
104
),
62
105
# IP Address pattern - IPv4 and IPv6
63
106
# IPv4: 4 octets of numbers 0-255 separated by dots
64
107
# IPv6: 8 groups of 1-4 hex digits separated by colons, with possible compression
108
+ # Note: Validates IPv4 octets to be in valid range (0-255)
65
109
"IP_ADDRESS" : re .compile (
66
- r"(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b|\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\b|\b(?:[A-Fa-f0-9]{1,4}:){0,6}(?::[A-Fa-f0-9]{1,4}){1,6}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,7}:\b)" ,
67
- re .IGNORECASE | re .MULTILINE ,
110
+ r"""
111
+ (?:
112
+ # IPv4 address pattern
113
+ \b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b
114
+ |
115
+ # Simple IPv6 pattern that matches all valid formats including compressed ones
116
+ \b(?:[0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\b
117
+ )
118
+ """ ,
119
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
68
120
),
69
121
# Date of Birth pattern - supports MM/DD/YYYY, M/D/YYYY, MM-DD-YYYY (2- or 4-digit year), and YYYY-MM-DD formats
70
- # Validates that month is 01-12 and day is 01-31 for MM/DD/YYYY format
122
+ # Note: Validates that month is 01-12 and day is 01-31
71
123
"DOB" : re .compile (
72
- r"\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:\d{2}|\d{4})|(?:\d{4})-(?:0?[1-9]|1[0-2])-(?:0?[1-9]|[12][0-9]|3[01]))\b" ,
73
- re .IGNORECASE | re .MULTILINE ,
124
+ r"""
125
+ \b
126
+ (?:
127
+ (?:0?[1-9]|1[0-2]) # Month: 1-12
128
+ [/-] # Separator (/ or -)
129
+ (?:0?[1-9]|[12][0-9]|3[01]) # Day: 1-31
130
+ [/-] # Separator (/ or -)
131
+ (?:\d{2}|\d{4}) # Year: 2 or 4 digits
132
+ |
133
+ (?:\d{4}) # Year: 4 digits (ISO format)
134
+ - # Separator (-)
135
+ (?:0?[1-9]|1[0-2]) # Month: 1-12
136
+ - # Separator (-)
137
+ (?:0?[1-9]|[12][0-9]|3[01]) # Day: 1-31
138
+ )
139
+ \b
140
+ """ ,
141
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
142
+ ),
143
+ # ZIP code pattern - US ZIP / ZIP+4
144
+ # Note: Supports both 5-digit ZIP and ZIP+4 format
145
+ "ZIP" : re .compile (
146
+ r"""
147
+ \b
148
+ \d{5} # 5-digit ZIP code
149
+ (?:-\d{4})? # Optional +4 extension
150
+ \b
151
+ """ ,
152
+ re .IGNORECASE | re .MULTILINE | re .VERBOSE ,
74
153
),
75
- "ZIP" : re .compile (r"\b\d{5}(?:-\d{4})?\b" , re .IGNORECASE | re .MULTILINE ),
76
154
}
77
155
78
156
@classmethod
@@ -129,7 +207,7 @@ def annotate_with_spans(
129
207
end = match .end (),
130
208
text = match .group (),
131
209
)
132
- spans_by_label . setdefault ( label , []) .append (span )
210
+ spans_by_label [ label ] .append (span )
133
211
all_spans .append (span )
134
212
135
213
regex_result = {
0 commit comments