Skip to content

Commit 648fae5

Browse files
committed
story 1.2
1 parent ef44a86 commit 648fae5

File tree

5 files changed

+723
-0
lines changed

5 files changed

+723
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from datafog.processing.text_processing.regex_annotator.regex_annotator import (
2+
AnnotationResult,
3+
RegexAnnotator,
4+
Span,
5+
)
6+
7+
# Public API of this package: the names imported from the regex_annotator module above.
__all__ = ["RegexAnnotator", "Span", "AnnotationResult"]
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import re
2+
from typing import Dict, List, Pattern, Tuple
3+
4+
from pydantic import BaseModel
5+
6+
7+
class Span(BaseModel):
    """A labelled slice of text identified by character offsets.

    Attributes:
        label: Entity label, e.g. "EMAIL".
        start: Character offset at which the span begins.
        end: Character offset at which the span ends.
        text: The actual text content covered by the span.
    """

    label: str
    start: int
    end: int
    text: str
14+
15+
16+
class AnnotationResult(BaseModel):
    """Structured annotation output: the input text plus every span found.

    Attributes:
        text: The input text that was annotated.
        spans: The spans found in ``text``.
    """

    text: str
    spans: List[Span]
21+
22+
23+
class RegexAnnotator:
    """Annotator that uses regular expressions to identify PII entities in text.

    This annotator serves as a fallback to the SpaCy annotator and is optimized for
    performance, targeting ≤ 20 µs / kB on a MacBook M-series.

    Every pattern is run independently over the whole input, so the same
    substring may be reported under more than one label.
    """

    # Labels for PII entities
    LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"]

    def __init__(self):
        # Compile all patterns once at initialization
        self.patterns: Dict[str, Pattern] = {
            # Email pattern - RFC 5322 subset
            # Allows for multiple dots, special characters in local part, and subdomains
            # The pattern is intentionally permissive to favor false positives over false negatives
            "EMAIL": re.compile(
                r"[\w!#$%&\'*+\-/=?^_`{|}~.]+@[\w\-.]+\.[\w\-.]+",
                re.IGNORECASE | re.MULTILINE,
            ),
            # Phone pattern - NANP (North American Numbering Plan) format
            # Accepts formats like: 555-555-5555, (555) 555-5555, +1 555 555 5555, 1-555-555-5555
            "PHONE": re.compile(
                r"(?:(?:\+|)1[-\.\s]?)?\(?\d{3}\)?[-\.\s]?\d{3}[-\.\s]?\d{4}",
                re.IGNORECASE | re.MULTILINE,
            ),
            # SSN pattern - U.S. Social Security Number
            # Format: XXX-XX-XXXX where XXX != 000, 666
            "SSN": re.compile(
                r"\b(?!000|666)\d{3}-\d{2}-\d{4}\b", re.IGNORECASE | re.MULTILINE
            ),
            # Credit card pattern - Visa, Mastercard, and American Express
            # Visa: 16 digits, starts with 4
            # Mastercard: 16 digits, starts with 51-55
            # American Express: 15 digits, starts with 34 or 37
            "CREDIT_CARD": re.compile(
                r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|(?:(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})|(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}))\b",
                re.IGNORECASE | re.MULTILINE,
            ),
            # IP Address pattern - IPv4 and IPv6
            # IPv4: 4 octets of numbers 0-255 separated by dots
            # IPv6: 8 groups of 1-4 hex digits separated by colons, with possible compression
            # The compressed form requires at least one "hex:" group before the
            # "::". Allowing zero groups would match any ":<hex>" tail that follows
            # a word character, e.g. ":30" in "12:30" or ":8080" in "10.0.0.1:8080".
            # Addresses that start with "::" (such as "::1") are not matched.
            "IP_ADDRESS": re.compile(
                r"(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b|\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,6}(?::[A-Fa-f0-9]{1,4}){1,6}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,7}:\b)",
                re.IGNORECASE | re.MULTILINE,
            ),
            # Date of Birth pattern - supports MM/DD/YYYY, M/D/YYYY, MM-DD-YYYY, and YYYY-MM-DD formats
            # Validates that month is 01-12 and day is 01-31 for MM/DD/YYYY format
            "DOB": re.compile(
                r"\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:\d{2}|\d{4})|(?:\d{4})-(?:0?[1-9]|1[0-2])-(?:0?[1-9]|[12][0-9]|3[01]))\b",
                re.IGNORECASE | re.MULTILINE,
            ),
            # US ZIP code, optionally followed by the 4-digit ZIP+4 extension
            "ZIP": re.compile(r"\b\d{5}(?:-\d{4})?\b", re.IGNORECASE | re.MULTILINE),
        }

    @classmethod
    def create(cls) -> "RegexAnnotator":
        """Factory method to create a new RegexAnnotator instance."""
        return cls()

    def annotate(self, text: str) -> Dict[str, List[str]]:
        """Annotate text with PII entities using regex patterns.

        Args:
            text: The input text to annotate

        Returns:
            A dictionary mapping entity labels to lists of matched strings.
            Every label in ``LABELS`` is present as a key, with an empty list
            when nothing matched (including when ``text`` is empty).
        """
        result = {label: [] for label in self.LABELS}

        # Return empty result for empty text
        if not text:
            return result

        # Process with each pattern; matches are kept in order of appearance
        for label, pattern in self.patterns.items():
            result[label].extend(match.group() for match in pattern.finditer(text))

        return result

    def annotate_with_spans(
        self, text: str
    ) -> Tuple[Dict[str, List[str]], "AnnotationResult"]:
        """Annotate text and return both dict format and structured format.

        Args:
            text: The input text to annotate

        Returns:
            A tuple containing:
            - A dictionary mapping entity labels to lists of matched strings
            - An AnnotationResult object with structured span information;
              spans are grouped by pattern, not sorted by position in the text
        """
        spans_by_label = {label: [] for label in self.LABELS}
        all_spans = []

        # Empty input: every label maps to an empty list and there are no spans
        if not text:
            return spans_by_label, AnnotationResult(text=text, spans=all_spans)

        for label, pattern in self.patterns.items():
            for match in pattern.finditer(text):
                span = Span(
                    label=label,
                    start=match.start(),
                    end=match.end(),
                    text=match.group(),
                )
                spans_by_label.setdefault(label, []).append(span)
                all_spans.append(span)

        # Flatten the spans to plain strings for the backward-compatible dict format
        regex_result = {
            lbl: [s.text for s in spans_by_label[lbl]] for lbl in spans_by_label
        }

        return regex_result, AnnotationResult(text=text, spans=all_spans)

notes/story-1.1-prd.md

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
<html><head></head><body><h3>Story 1.1 </h3>
2+
<hr>
3+
<h2>1. Entity menu (MVP for 4.1)</h2>
4+
5+
| Label | Scope | Regex sketch | Notes |
6+
| ----------- | ----------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
7+
| EMAIL | RFC 5322 subset | [\w.+-]+@[\w-]+\.[\w.-]{2,} | Good enough for 99 % of mail; avoids huge RFC monsters. (Regex validation of email addresses according to RFC5321/RFC5322) |
8+
| PHONE | NANP 10-digit | (?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4} | Accepts 555-555-5555, (555) 555-5555, +1 555 555 5555. (Regular expression to match standard 10 digit phone number) |
9+
| SSN         | U.S. social security                | \b\d{3}-\d{2}-\d{4}\b                              | This sketch does not yet reject area numbers “000” or “666”; add a negative lookahead for them later if needed. |
10+
| CREDIT_CARD | Visa/Mastercard/AmEx                | `\b(?:4\d{12}(?:\d{3})?\|5[1-5]\d{14}\|3[47]\d{13})\b` | No Luhn validation in 4.1.0 (see §4). |
11+
| IP_ADDRESS  | IPv4 + v6                           | `(?:\b\d{1,3}(?:\.\d{1,3}){3}\b\|(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4})` | Reject obviously invalid IPv4 octets (see §4). |
12+
| DOB         | Dates like MM/DD/YYYY or YYYY-MM-DD | `\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\|\d{4}-\d{2}-\d{2})\b` | |
13+
| ZIP | US ZIP / ZIP+4 | \b\d{5}(?:-\d{4})?\b | Locale-specific; extend with postcodes later. |
14+
15+
<p><em>All patterns compiled with <code inline="">re.IGNORECASE | re.MULTILINE</code> and wrapped in <code inline="">r''</code> raw strings.</em></p>
16+
<hr>
17+
<h2>2. Return-value schema</h2>
18+
<h3>2.1 Keep the <em>dict-of-lists</em> for backward compatibility</h3>
19+
<pre><code class="language-python">from typing import Dict, List
20+
21+
Annotation = Dict[str, List[str]]
22+
23+
# e.g. {"EMAIL": ["user@example.com"], "PHONE": ["555-555-5555"]}
24+
25+
</code></pre>
26+
27+
<h3>2.2 Offer an optional structured model (new but additive)</h3>
28+
<pre><code class="language-python">from pydantic import BaseModel
29+
from typing import List
30+
31+
class Span(BaseModel):
32+
label: str # "EMAIL"
33+
start: int # char offset
34+
end: int # char offset
35+
text: str
36+
37+
class AnnotationResult(BaseModel):
38+
text: str
39+
spans: List[Span]
40+
</code></pre>
41+
42+
<p><em>Why both?</em> Existing users don’t break; new users get richer data. The regex annotator returns <strong>both</strong>:</p>
43+
<pre><code class="language-python">regex_result = {lbl: [s.text for s in spans_by_label[lbl]] for lbl in spans_by_label}
44+
return regex_result, AnnotationResult(text=input_text, spans=all_spans)
45+
</code></pre>
46+
<p><code inline="">TextService</code> will pick whichever format the caller asked for.</p>
47+
<hr>
48+
<h2>3. Performance budget</h2>
49+
<ul>
50+
<li>
51+
<p>Target ≤ 20 µs / kB on a MacBook M-series at -O.</p>
52+
</li>
53+
<li>
54+
<p>Compile all patterns once at module import.</p>
55+
</li>
56+
<li>
57+
<p>Run <code inline="">re.finditer</code> for each pattern, append spans; no pandas, no multiprocessing.</p>
58+
</li>
59+
</ul>
60+
<hr>
61+
<h2>4. Edge-case policy</h2>
62+
<ul>
63+
<li>
64+
<p><strong>False positives &gt; false negatives</strong> for v 1: easier to redact extra than miss PII.</p>
65+
</li>
66+
<li>
67+
<p>No validation (e.g., Luhn) in 4.1.0; add later under a <code inline="">validate=True</code> flag.</p>
68+
</li>
69+
<li>
70+
<p>Reject obviously invalid IPv4 octets (<code inline="">25[6-9]</code>, <code inline="">3\d{2}</code>) to keep noise down.</p>
71+
</li>
72+
</ul>
73+
<hr>
74+
<h2>5. Acceptance checklist (feeds Story 1.4 baseline)</h2>
75+
<ul class="contains-task-list">
76+
<li class="task-list-item">
77+
<p><input type="checkbox" disabled=""> All patterns compile.</p>
78+
</li>
79+
<li class="task-list-item">
80+
<p><input type="checkbox" disabled=""> Unit tests pass on happy-path and tricky strings (<code inline="">foo@[123.456.789.000]</code> must not be matched as an email).</p>
81+
</li>
82+
<li class="task-list-item">
83+
<p><input type="checkbox" disabled=""> Benchmarks show regex path at least <strong>5× faster</strong> than spaCy on 10 kB sample.</p>
84+
</li>
85+
<li class="task-list-item">
86+
<p><input type="checkbox" disabled=""> Output dict keys exactly match label names above.</p>
87+
</li>
88+
</ul>
89+
</body></html>

notes/story-1.2-tkt.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
### TDD Plan for Story 1.2: _Design the regex fallback spec_
2+
3+
#### 1. **Setup Testing Environment**
4+
5+
- [ ] Create a new test module (e.g., `test_regex_annotator.py`)
6+
- [ ] Import `pytest` and set up fixtures if needed
7+
8+
#### 2. **Write Failing Tests First**
9+
10+
##### 2.1 Entity Patterns (regex)
11+
12+
For each label below, write a unit test with:
13+
14+
- One clearly matching string
15+
- One edge-case false negative
16+
- One false positive to avoid
17+
18+
- [ ] `test_email_regex()`
19+
- [ ] `test_phone_regex()`
20+
- [ ] `test_ssn_regex()`
21+
- [ ] `test_credit_card_regex()`
22+
- [ ] `test_ip_address_regex()`
23+
- [ ] `test_dob_regex()`
24+
- [ ] `test_zip_regex()`
25+
26+
##### 2.2 Return Schema
27+
28+
- [ ] `test_annotation_dict_format()`
29+
Assert that a sample input returns `Dict[str, List[str]]` with correct keys and values.
30+
31+
- [ ] `test_annotation_result_format()`
32+
Assert that the structured `AnnotationResult` returns correct spans with offsets and labels.
33+
34+
##### 2.3 Performance Constraint
35+
36+
- [ ] `test_regex_performance()`
37+
Benchmark annotation on a 10 KB input and assert runtime < 200 µs.
38+
39+
##### 2.4 Edge Case Policy
40+
41+
- [ ] `test_invalid_ip_filtered()`
42+
Ensure IPs like `999.999.999.999` or `256.1.1.1` are skipped.
43+
44+
- [ ] `test_loose_date_acceptance()`
45+
Accept both `01/01/2000` and `2000-01-01`.
46+
47+
- [ ] `test_tricky_email_rejection()`
48+
Reject `foo@[123.456.789.000]`.
49+
50+
##### 2.5 Contract Compliance
51+
52+
- [ ] `test_output_keys_match_labels()`
53+
Ensure output dict keys are exactly: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DOB`, `ZIP`.
54+
55+
---
56+
57+
#### 3. **Stub Out Regex Annotator**
58+
59+
- [ ] Create a skeleton module: `regex_annotator.py`
60+
- [ ] Define pattern table (label → compiled regex)
61+
- [ ] Define `Span` and `AnnotationResult` classes
62+
- [ ] Stub `annotate(text: str)` to return fixed values
63+
64+
---
65+
66+
#### 4. **Iteratively Implement Logic**
67+
68+
- [ ] Implement each regex and rerun tests until each corresponding test passes.
69+
- [ ] Implement span extraction logic using `re.finditer`.
70+
- [ ] Implement both `dict` and `structured` output formats.
71+
- [ ] Optimize for performance — compile all patterns once, run in sequence.
72+
73+
---
74+
75+
#### 5. **Refactor**
76+
77+
- [ ] Group tests using parameterization where possible
78+
- [ ] Add fixtures for shared input text
79+
- [ ] Split long regex into readable multiline strings (with `re.VERBOSE` if needed)
80+
81+
---

0 commit comments

Comments
 (0)