|
| 1 | +import re |
| 2 | +from typing import Dict, List, Pattern, Tuple |
| 3 | + |
| 4 | +from pydantic import BaseModel |
| 5 | + |
| 6 | + |
class Span(BaseModel):
    """A single labelled match located inside a piece of text.

    Instances are built by ``RegexAnnotator.annotate_with_spans`` from
    ``re.Match`` objects, so ``start``/``end`` follow the ``re`` convention:
    ``start`` is the index of the first matched character and ``end`` is the
    index just past the last one.
    """

    # Entity label, e.g. "EMAIL".
    label: str
    # Character offset where the match begins.
    start: int
    # Character offset where the match stops.
    end: int
    # The matched substring itself.
    text: str
| 14 | + |
| 15 | + |
class AnnotationResult(BaseModel):
    """Structured annotation output: the analysed text plus every span found."""

    # The text that was annotated, stored unchanged.
    text: str
    # All spans found in ``text``.
    spans: List[Span]
| 21 | + |
| 22 | + |
class RegexAnnotator:
    """Annotator that uses regular expressions to identify PII entities in text.

    This annotator serves as a fallback to the SpaCy annotator and is optimized
    for performance, targeting ≤ 20 µs / kB on a MacBook M-series.

    Attributes:
        LABELS: Entity labels that always appear as keys in the result dicts.
        patterns: Mapping of label -> compiled regular expression, built once
            per instance in ``__init__``.
    """

    # Labels for PII entities
    LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"]

    def __init__(self):
        """Compile every pattern once so annotation calls only run matches."""
        self.patterns: Dict[str, Pattern] = {
            # Email pattern - RFC 5322 subset.
            # Allows multiple dots, special characters in the local part, and
            # subdomains. Intentionally permissive (false positives are
            # preferred over false negatives), but the domain must END on a
            # word character or hyphen so that sentence punctuation such as the
            # final "." in "mail me at a@b.com." is not swallowed into the
            # match (which would corrupt both the text and the end offset).
            "EMAIL": re.compile(
                r"[\w!#$%&\'*+\-/=?^_`{|}~.]+@[\w\-.]+\.[\w\-.]*[\w\-]",
                re.IGNORECASE | re.MULTILINE,
            ),
            # Phone pattern - NANP (North American Numbering Plan) format.
            # Accepts formats like: 555-555-5555, (555) 555-5555,
            # +1 555 555 5555, 1-555-555-5555.
            # The digit lookarounds stop the pattern from firing on a slice of
            # a longer digit run (e.g. the first 10 digits of a card number).
            # "\b" cannot be used here because a match may start with "(" / "+".
            "PHONE": re.compile(
                r"(?<!\d)(?:(?:\+|)1[-\.\s]?)?\(?\d{3}\)?[-\.\s]?\d{3}[-\.\s]?\d{4}(?!\d)",
                re.IGNORECASE | re.MULTILINE,
            ),
            # SSN pattern - U.S. Social Security Number.
            # Format: XXX-XX-XXXX where XXX != 000, 666.
            "SSN": re.compile(
                r"\b(?!000|666)\d{3}-\d{2}-\d{4}\b", re.IGNORECASE | re.MULTILINE
            ),
            # Credit card pattern - Visa, Mastercard, and American Express.
            # Visa: 13 or 16 digits, starts with 4
            # Mastercard: 16 digits, starts with 51-55
            # American Express: 15 digits, starts with 34 or 37
            # Grouped forms separated by "-" or whitespace are also accepted.
            "CREDIT_CARD": re.compile(
                r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|(?:(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})|(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}))\b",
                re.IGNORECASE | re.MULTILINE,
            ),
            # IP Address pattern - IPv4 and IPv6.
            # IPv4: 4 octets of numbers 0-255 separated by dots
            # IPv6: 8 groups of 1-4 hex digits separated by colons, with
            # possible compression
            "IP_ADDRESS": re.compile(
                r"(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b|\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\b|\b(?:[A-Fa-f0-9]{1,4}:){0,6}(?::[A-Fa-f0-9]{1,4}){1,6}\b|\b(?:[A-Fa-f0-9]{1,4}:){1,7}:\b)",
                re.IGNORECASE | re.MULTILINE,
            ),
            # Date of Birth pattern - supports MM/DD/YYYY, M/D/YYYY, MM-DD-YYYY
            # (2- or 4-digit year) and YYYY-MM-DD formats.
            # Validates that month is 1-12 and day is 1-31.
            "DOB": re.compile(
                r"\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:\d{2}|\d{4})|(?:\d{4})-(?:0?[1-9]|1[0-2])-(?:0?[1-9]|[12][0-9]|3[01]))\b",
                re.IGNORECASE | re.MULTILINE,
            ),
            # ZIP pattern - 5 digits with an optional "-NNNN" (ZIP+4) suffix.
            "ZIP": re.compile(r"\b\d{5}(?:-\d{4})?\b", re.IGNORECASE | re.MULTILINE),
        }

    @classmethod
    def create(cls) -> "RegexAnnotator":
        """Factory method to create a new RegexAnnotator instance."""
        return cls()

    def annotate(self, text: str) -> Dict[str, List[str]]:
        """Annotate text with PII entities using regex patterns.

        Args:
            text: The input text to annotate.

        Returns:
            A dictionary mapping entity labels to lists of matched strings.
            Every label in ``LABELS`` is present, with an empty list when
            nothing matched (including when ``text`` is empty). Each pattern
            is applied independently, so the same characters may be reported
            under more than one label.
        """
        result: Dict[str, List[str]] = {label: [] for label in self.LABELS}

        # Nothing to scan: return the all-empty skeleton.
        if not text:
            return result

        for label, pattern in self.patterns.items():
            # setdefault keeps this safe if ``patterns`` carries a label that
            # is not listed in LABELS (mirrors annotate_with_spans).
            result.setdefault(label, []).extend(
                match.group() for match in pattern.finditer(text)
            )

        return result

    def annotate_with_spans(
        self, text: str
    ) -> Tuple[Dict[str, List[str]], "AnnotationResult"]:
        """Annotate text and return both dict format and structured format.

        Args:
            text: The input text to annotate.

        Returns:
            A tuple containing:
              - A dictionary mapping entity labels to lists of matched strings
                (same shape as the result of ``annotate``).
              - An AnnotationResult holding ``text`` and one Span per match,
                ordered pattern by pattern and, within a pattern, by position.
        """
        matches_by_label: Dict[str, List[str]] = {label: [] for label in self.LABELS}
        all_spans: List["Span"] = []

        # Nothing to scan: return the all-empty skeleton and an empty result.
        if not text:
            return matches_by_label, AnnotationResult(text=text, spans=all_spans)

        for label, pattern in self.patterns.items():
            for match in pattern.finditer(text):
                matched_text = match.group()
                all_spans.append(
                    Span(
                        label=label,
                        start=match.start(),
                        end=match.end(),
                        text=matched_text,
                    )
                )
                matches_by_label.setdefault(label, []).append(matched_text)

        return matches_by_label, AnnotationResult(text=text, spans=all_spans)
0 commit comments