story 1.2

sidmohan0 · sidmohan0 · commit 648fae51b095 · 2025-05-01T20:07:54.000-07:00
diff --git a/datafog/processing/text_processing/regex_annotator/__init__.py b/datafog/processing/text_processing/regex_annotator/__init__.py
@@ -0,0 +1,7 @@
+from datafog.processing.text_processing.regex_annotator.regex_annotator import (
+    AnnotationResult,
+    RegexAnnotator,
+    Span,
+)
+
+__all__ = ["RegexAnnotator", "Span", "AnnotationResult"]
diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
@@ -0,0 +1,139 @@
+import re
+from typing import Dict, List, Pattern, Tuple
+
+from pydantic import BaseModel
+
+
+class Span(BaseModel):
+    """One labeled match: entity label, character offsets into the input, and the matched text."""
+
+    label: str  # entity label, e.g. "EMAIL"
+    start: int  # char offset where the match begins (RegexAnnotator fills it from match.start())
+    end: int  # char offset just past the match (RegexAnnotator fills it from match.end())
+    text: str  # The actual text content
+
+
+class AnnotationResult(BaseModel):
+    """Structured annotation output: the annotated text plus the spans found in it."""
+
+    text: str  # The input text that was annotated
+    spans: List[Span]  # Spans found in the text, in the order the producer appended them
+
+
+class RegexAnnotator:
+    """Annotator that uses regular expressions to identify PII entities in text.
+
+    This annotator serves as a fallback to the SpaCy annotator and is optimized for
+    performance, targeting ≤ 20 µs / kB on a MacBook M-series.
+    """
+
+    # Labels for PII entities; annotate() returns exactly these keys, in this order
+    LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"]
+
+    def __init__(self):
+        # Compile all patterns once at initialization; annotate calls only run finditer
+        self.patterns: Dict[str, Pattern] = {
+            # Email pattern - RFC 5322 subset
+            # Allows for multiple dots, special characters in local part, and subdomains
+            # The pattern is intentionally permissive to favor false positives over false negatives
+            "EMAIL": re.compile(
+                r"[\w!#$%&\'*+\-/=?^_`{|}~.]+@[\w\-.]+\.[\w\-.]+",
+                re.IGNORECASE | re.MULTILINE,
+            ),
+            # Phone pattern - NANP (North American Numbering Plan) format
+            # Accepts formats like: 555-555-5555, (555) 555-5555, +1 555 555 5555, 1-555-555-5555
+            "PHONE": re.compile(
+                r"(?:(?:\+|)1[-\.\s]?)?\(?\d{3}\)?[-\.\s]?\d{3}[-\.\s]?\d{4}",
+                re.IGNORECASE | re.MULTILINE,
+            ),
+            # SSN pattern - U.S. Social Security Number
+            # Format: XXX-XX-XXXX where XXX != 000, 666
+            "SSN": re.compile(
+                r"\b(?!000|666)\d{3}-\d{2}-\d{4}\b", re.IGNORECASE | re.MULTILINE
+            ),
+            # Credit card pattern - Visa, Mastercard, and American Express
+            # Visa: 13 or 16 digits, starts with 4; Mastercard: 16 digits, starts with 51-55
+            # American Express: 15 digits, starts with 34 or 37
+            # Grouped forms may use "-" or whitespace between the digit groups
+            "CREDIT_CARD": re.compile(
+                r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|(?:(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})|(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}))\b",
+                re.IGNORECASE | re.MULTILINE,
+            ),
+            # IP Address pattern - IPv4 (4 octets, each 0-255) and IPv6 (8 hex groups or "::"-compressed)
+            # A compressed IPv6 form must start with a hex group; otherwise ":30" in a clock time such
+            # as "12:30" is reported as an address. A trailing "::" must not be followed by a word char.
+            "IP_ADDRESS": re.compile(
+                r"(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b|\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,6}(?::[A-Fa-f0-9]{1,4}){1,6}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,7}:(?![\w:]))",
+                re.IGNORECASE | re.MULTILINE,
+            ),
+            # Date of Birth pattern - supports MM/DD/YYYY, M/D/YYYY, MM-DD-YYYY, and YYYY-MM-DD formats
+            # Validates that month is 01-12 and day is 01-31 for MM/DD/YYYY format
+            "DOB": re.compile(
+                r"\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:\d{2}|\d{4})|(?:\d{4})-(?:0?[1-9]|1[0-2])-(?:0?[1-9]|[12][0-9]|3[01]))\b",
+                re.IGNORECASE | re.MULTILINE,
+            ),
+            "ZIP": re.compile(r"\b\d{5}(?:-\d{4})?\b", re.IGNORECASE | re.MULTILINE),
+        }
+
+    @classmethod
+    def create(cls) -> "RegexAnnotator":
+        """Factory method to create a new RegexAnnotator instance."""
+        return cls()
+
+    def annotate(self, text: str) -> Dict[str, List[str]]:
+        """Annotate text with PII entities using regex patterns.
+
+        Args:
+            text: The input text to annotate
+
+        Returns:
+            A dictionary with one key per entry in LABELS; each value lists the
+            matched strings for that label (an empty list when nothing matched)
+        """
+        result: Dict[str, List[str]] = {label: [] for label in self.LABELS}
+
+        # Return empty result for empty text
+        if not text:
+            return result
+
+        # Each pattern scans the text independently, so one substring may appear under several labels
+        for label, pattern in self.patterns.items():
+            result[label].extend(m.group() for m in pattern.finditer(text))
+
+        return result
+
+    def annotate_with_spans(
+        self, text: str
+    ) -> Tuple[Dict[str, List[str]], AnnotationResult]:
+        """Annotate text and return both dict format and structured format.
+
+        Args:
+            text: The input text to annotate
+
+        Returns:
+            A tuple containing:
+            - A dictionary mapping entity labels to lists of matched strings
+            - An AnnotationResult whose spans are grouped by pattern, not sorted by offset
+        """
+        spans_by_label = {label: [] for label in self.LABELS}
+        all_spans = []
+
+        if not text:
+            return spans_by_label, AnnotationResult(text=text, spans=all_spans)
+
+        for label, pattern in self.patterns.items():
+            for match in pattern.finditer(text):
+                span = Span(
+                    label=label,
+                    start=match.start(),
+                    end=match.end(),
+                    text=match.group(),
+                )
+                spans_by_label.setdefault(label, []).append(span)
+                all_spans.append(span)
+
+        regex_result = {
+            lbl: [s.text for s in spans_by_label[lbl]] for lbl in spans_by_label
+        }
+
+        return regex_result, AnnotationResult(text=text, spans=all_spans)
diff --git a/notes/story-1.1-prd.md b/notes/story-1.1-prd.md
@@ -0,0 +1,89 @@
+<html><head></head><body><h3>Story 1.1 </h3>
+<hr>
+<h2>1. Entity menu (MVP for 4.1)</h2>
+
+| Label       | Scope                               | Regex sketch                                       | Notes                                                                                                                      |
+| ----------- | ----------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
+| EMAIL       | RFC 5322 subset                     | [\w.+-]+@[\w-]+\.[\w.-]{2,}                        | Good enough for 99 % of mail; avoids huge RFC monsters. (Regex validation of email addresses according to RFC5321/RFC5322) |
+| PHONE       | NANP 10-digit                       | (?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4} | Accepts 555-555-5555, (555) 555-5555, +1 555 555 5555. (Regular expression to match standard 10 digit phone number)        |
+| SSN         | U.S. social security                | \b\d{3}-\d{2}-\d{4}\b                              | Should reject “000-” and “666-” starts; the sketch does not yet. (Add later if needed.)                                    |
+| CREDIT_CARD | Visa/Mastercard/AmEx                | `\b(?:4\d{12}(?:\d{3})?\|5[1-5]\d{14}\|3[47]\d{13})\b` |  |
+| IP_ADDRESS  | IPv4 + v6                           | `(?:\b\d{1,3}(?:\.\d{1,3}){3}\b\|(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4})` |  |
+| DOB         | Dates like MM/DD/YYYY or YYYY-MM-DD | `\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\|\d{4}-\d{2}-\d{2})\b` |  |
+| ZIP         | US ZIP / ZIP+4                      | \b\d{5}(?:-\d{4})?\b                               | Locale-specific; extend with postcodes later.                                                                              |
+
+<p><em>All patterns compiled with <code inline="">re.IGNORECASE | re.MULTILINE</code> and wrapped in <code inline="">r''</code> raw strings.</em></p>
+<hr>
+<h2>2. Return-value schema</h2>
+<h3>2.1 Keep the <em>dict-of-lists</em> for backward compatibility</h3>
+<pre><code class="language-python">from typing import Dict, List
+
+Annotation = Dict[str, List[str]]
+
+# e.g. {"EMAIL": ["user@example.com"], "PHONE": ["555-555-5555"]}
+
+</code></pre>
+
+<h3>2.2 Offer an optional structured model (new but additive)</h3>
+<pre><code class="language-python">from pydantic import BaseModel
+from typing import List
+
+class Span(BaseModel):
+    label: str # "EMAIL"
+    start: int # char offset
+    end: int # char offset
+    text: str
+
+class AnnotationResult(BaseModel):
+    text: str
+    spans: List[Span]
+</code></pre>
+
+<p><em>Why both?</em> Existing users don’t break; new users get richer data. The regex annotator returns <strong>both</strong>:</p>
+<pre><code class="language-python">regex_result = {lbl: [s.text for s in spans_by_label[lbl]] for lbl in spans_by_label}
+return regex_result, AnnotationResult(text=input_text, spans=all_spans)
+</code></pre>
+<p><code inline="">TextService</code> will pick whichever format the caller asked for.</p>
+<hr>
+<h2>3. Performance budget</h2>
+<ul>
+<li>
+<p>Target ≤ 20 µs / kB on a MacBook M-series at -O.</p>
+</li>
+<li>
+<p>Compile all patterns once at module import.</p>
+</li>
+<li>
+<p>Run <code inline="">re.finditer</code> for each pattern, append spans; no pandas, no multiprocessing.</p>
+</li>
+</ul>
+<hr>
+<h2>4. Edge-case policy</h2>
+<ul>
+<li>
+<p><strong>False positives &gt; false negatives</strong> for v 1: easier to redact extra than miss PII.</p>
+</li>
+<li>
+<p>No validation (e.g., Luhn) in 4.1.0; add later under a <code inline="">validate=True</code> flag.</p>
+</li>
+<li>
+<p>Reject obviously invalid IPv4 octets (<code inline="">25[6-9]</code>, <code inline="">3\d{2}</code>) to keep noise down.</p>
+</li>
+</ul>
+<hr>
+<h2>5. Acceptance checklist (feeds Story 1.4 baseline)</h2>
+<ul class="contains-task-list">
+<li class="task-list-item">
+<p><input type="checkbox" disabled=""> All patterns compile.</p>
+</li>
+<li class="task-list-item">
+<p><input type="checkbox" disabled=""> Unit tests pass on happy-path and tricky strings (<code inline="">foo@[123.456.789.000]</code> should fail).</p>
+</li>
+<li class="task-list-item">
+<p><input type="checkbox" disabled=""> Benchmarks show regex path at least <strong>5× faster</strong> than spaCy on 10 kB sample.</p>
+</li>
+<li class="task-list-item">
+<p><input type="checkbox" disabled=""> Output dict keys exactly match label names above.</p>
+</li>
+</ul>
+</body></html>
diff --git a/notes/story-1.2-tkt.md b/notes/story-1.2-tkt.md
@@ -0,0 +1,81 @@
+### TDD Plan for Story 1.2: _Design the regex fallback spec_
+
+#### 1. **Setup Testing Environment**
+
+- [ ] Create a new test module (e.g., `test_regex_annotator.py`)
+- [ ] Import `pytest` and set up fixtures if needed
+
+#### 2. **Write Failing Tests First**
+
+##### 2.1 Entity Patterns (regex)
+
+For each label below, write a unit test with:
+
+- One clearly matching string
+- One edge-case false negative
+- One false positive to avoid
+
+- [ ] `test_email_regex()`
+- [ ] `test_phone_regex()`
+- [ ] `test_ssn_regex()`
+- [ ] `test_credit_card_regex()`
+- [ ] `test_ip_address_regex()`
+- [ ] `test_dob_regex()`
+- [ ] `test_zip_regex()`
+
+##### 2.2 Return Schema
+
+- [ ] `test_annotation_dict_format()`  
+       Assert that a sample input returns `Dict[str, List[str]]` with correct keys and values.
+
+- [ ] `test_annotation_result_format()`  
+       Assert that the structured `AnnotationResult` returns correct spans with offsets and labels.
+
+##### 2.3 Performance Constraint
+
+- [ ] `test_regex_performance()`  
+       Benchmark annotation on a 10 KB input and assert runtime < 200 µs.
+
+##### 2.4 Edge Case Policy
+
+- [ ] `test_invalid_ip_filtered()`  
+       Ensure IPs like `999.999.999.999` or `256.1.1.1` are skipped.
+
+- [ ] `test_loose_date_acceptance()`  
+       Accept both `01/01/2000` and `2000-01-01`.
+
+- [ ] `test_tricky_email_rejection()`  
+       Reject `foo@[123.456.789.000]`.
+
+##### 2.5 Contract Compliance
+
+- [ ] `test_output_keys_match_labels()`  
+       Ensure output dict keys are exactly: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DOB`, `ZIP`.
+
+---
+
+#### 3. **Stub Out Regex Annotator**
+
+- [ ] Create a skeleton module: `regex_annotator.py`
+- [ ] Define pattern table (label → compiled regex)
+- [ ] Define `Span` and `AnnotationResult` classes
+- [ ] Stub `annotate(text: str)` to return fixed values
+
+---
+
+#### 4. **Iteratively Implement Logic**
+
+- [ ] Implement each regex and rerun tests until each corresponding test passes.
+- [ ] Implement span extraction logic using `re.finditer`.
+- [ ] Implement both `dict` and `structured` output formats.
+- [ ] Optimize for performance — compile all patterns once, run in sequence.
+
+---
+
+#### 5. **Refactor**
+
+- [ ] Group tests using parameterization where possible
+- [ ] Add fixtures for shared input text
+- [ ] Split long regex into readable multiline strings (with `re.VERBOSE` if needed)
+
+---
diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py