
Run benchmarks on pushes and pull requests #68


Merged: 1 commit, May 3, 2025
`.github/workflows/benchmark.yml` (134 changes: 67 additions & 67 deletions)
The whole file was reformatted (quoting and indentation normalized); the resulting workflow:

```yaml
name: Performance Benchmarks

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main, develop]
  # Schedule benchmarks to run weekly
  schedule:
    - cron: "0 0 * * 0" # Run at midnight on Sundays

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0 # Fetch all history for proper comparison

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: "pip"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .
          pip install -r requirements-dev.txt
          pip install pytest-benchmark

      - name: Restore benchmark data
        uses: actions/cache@v3
        with:
          path: .benchmarks
          key: benchmark-${{ runner.os }}-${{ hashFiles('**/requirements*.txt') }}
          restore-keys: |
            benchmark-${{ runner.os }}-

      - name: Run benchmarks and save baseline
        run: |
          # Run benchmarks and save results
          pytest tests/benchmark_text_service.py -v --benchmark-autosave

      - name: Check for performance regression
        run: |
          # Compare against the previous benchmark if available.
          # Fail if performance degrades by more than 10%.
          if [ -d ".benchmarks" ]; then
            # pytest-benchmark autosaves into a machine-specific subdirectory
            benchmark_dir=".benchmarks/Linux-CPython-3.10-64bit"
            BASELINE=$(ls -t "$benchmark_dir" | head -n 2 | tail -n 1)
            CURRENT=$(ls -t "$benchmark_dir" | head -n 1)
            if [ -n "$BASELINE" ] && [ "$BASELINE" != "$CURRENT" ]; then
              # Set full paths to the benchmark files
              BASELINE_FILE="$benchmark_dir/$BASELINE"
              CURRENT_FILE="$benchmark_dir/$CURRENT"

              echo "Comparing current run ($CURRENT) against baseline ($BASELINE)"
              # First just show the comparison
              pytest tests/benchmark_text_service.py --benchmark-compare

              # Then check for significant regressions
              echo "Checking for performance regressions (>10% slower)..."
              # Use our Python script for benchmark comparison
              python scripts/compare_benchmarks.py "$BASELINE_FILE" "$CURRENT_FILE"
            else
              echo "No previous benchmark found for comparison or only one benchmark exists"
            fi
          else
            echo "No benchmarks directory found"
          fi

      - name: Upload benchmark results
        uses: actions/upload-artifact@v3
        with:
          name: benchmark-results
          path: .benchmarks/

      - name: Alert on regression
        if: failure()
        run: |
          echo "::warning::Performance regression detected! Check benchmark results."
```
`README.md` (9 changes: 5 additions & 4 deletions)

Context line from the README example above this hunk: `auto_service = TextService()  # engine="auto" is the default`

Benchmark tests show that the regex engine is significantly faster than spaCy for PII detection:

| Engine | Processing Time (10KB text) | Entities Detected                                    |
| ------ | --------------------------- | ---------------------------------------------------- |
| Regex  | ~0.004 seconds              | EMAIL, PHONE, SSN, CREDIT_CARD, IP_ADDRESS, DOB, ZIP |
| SpaCy  | ~0.48 seconds               | PERSON, ORG, GPE, CARDINAL, FAC                      |
| Auto   | ~0.004 seconds              | Same as regex when patterns are found                |

**Key findings:**

- The regex engine is approximately **123x faster** than spaCy for processing the same text
- The auto engine provides the best balance between speed and comprehensiveness
- Uses fast regex patterns first
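
A quick way to reproduce these numbers locally; the import path and the synthetic sample text are assumptions, not part of this PR:

```python
# Hypothetical reproduction script; module path assumed.
import time

from datafog.services.text_service import TextService  # import path assumed

# Roughly 10 KB of text containing regex-detectable PII
sample = "Contact jane@example.com or call 555-867-5309. " * 200

for engine in ("regex", "spacy", "auto"):
    service = TextService(engine=engine)
    start = time.perf_counter()
    result = service.annotate(sample)
    elapsed = time.perf_counter() - start
    print(f"{engine:>5}: {elapsed:.4f}s, entity types: {sorted(result)}")
```
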
`notes/story-1.3-tkt.md` (24 changes: 14 additions & 10 deletions)

## ✅ **Story 1.3 – Integrate Regex Annotator into `TextService`**

> **Goal:** Allow `TextService` to support a pluggable engine via `engine="regex" | "spacy" | "auto"`.
---

### 📂 0. **Preconditions**

- [ ] Confirm `RegexAnnotator` is implemented and returns both:
- `Dict[str, List[str]]` for legacy compatibility
- `AnnotationResult` for structured output
### 🔨 1. Add `engine` Parameter to `TextService`

#### Code:

```python
class TextService:
    def __init__(self, engine: str = "auto", ...):
```
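
The rest of the constructor is collapsed in the diff view. One plausible shape, with the validation the three-mode contract implies; `RegexAnnotator`'s import path and the attribute names are assumptions:

```python
# Sketch only; RegexAnnotator's import path and attribute names are assumptions.
class TextService:
    def __init__(self, engine: str = "auto"):
        if engine not in ("regex", "spacy", "auto"):
            raise ValueError(f"engine must be 'regex', 'spacy', or 'auto', got {engine!r}")
        self.engine = engine
        self.regex_annotator = RegexAnnotator()  # implemented per the Preconditions above
```
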
Next, add branching logic in `annotate()` to support all three modes.

#### Pseudocode:

```python
def annotate(self, text: str, structured: bool = False):
    if self.engine == "regex":
```
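
The middle of this block is collapsed in the diff. A sketch of how the full branching could look; the annotator attributes and the conversion helper are assumptions beyond what the story specifies:

```python
# Sketch only; annotator attributes and the _to_annotation_result helper are assumptions.
def annotate(self, text: str, structured: bool = False):
    if self.engine == "regex":
        result = self.regex_annotator.annotate(text)
    elif self.engine == "spacy":
        result = self.spacy_annotator.annotate(text)
    else:  # "auto": try fast regex first, fall back to spaCy if nothing is found
        result = self.regex_annotator.annotate(text)
        if not any(result.values()):
            result = self.spacy_annotator.annotate(text)
    if structured:
        # AnnotationResult / Span come from the Preconditions in section 0
        return self._to_annotation_result(result)  # hypothetical helper
    return result
```
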
### 🧪 3. Write Integration Tests

#### 3.1 Happy Path (Regex Only)

- [ ] `test_engine_regex_detects_simple_entities()`
  Inputs: email, phone
  Asserts: `TextService(engine="regex").annotate(text)` returns expected dict

#### 3.2 Fallback (Auto → SpaCy)

- [ ] `test_engine_auto_fallbacks_to_spacy()`
  Inputs: Named entities or tricky patterns regex misses
  Asserts: spaCy is invoked if regex finds nothing

#### 3.3 Explicit SpaCy

- [ ] `test_engine_spacy_only()`
  Asserts: spaCy is always used regardless of regex hits

#### 3.4 Structured Return

- [ ] `test_structured_annotation_output()`
  Asserts: `structured=True` returns list of `Span` objects with label/start/end/text
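
A minimal sketch of test 3.1; the import path, sample values, and entity keys are assumptions:

```python
# Hypothetical test sketch; module path and entity keys assumed.
from datafog.services.text_service import TextService  # import path assumed

def test_engine_regex_detects_simple_entities():
    text = "Reach me at jane@example.com or 555-867-5309."
    result = TextService(engine="regex").annotate(text)
    assert "jane@example.com" in result.get("EMAIL", [])
    assert any("555-867-5309" in hit for hit in result.get("PHONE", []))
```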

---

### 📏 4. Performance Budget (Optional But Valuable)

- [ ] Add benchmarking test to compare `regex` vs `spacy` on a 10 KB text
- [ ] Log and confirm regex is ≥5× faster than spaCy in most scenarios
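
A sketch of the benchmarking item above, using pytest-benchmark's `benchmark` fixture; import path and sample text are assumptions:

```python
# Hypothetical benchmark sketch; module path and sample text assumed.
import pytest

from datafog.services.text_service import TextService  # import path assumed

TEN_KB_TEXT = "Contact jane@example.com or call 555-867-5309. " * 200  # ~10 KB

@pytest.mark.parametrize("engine", ["regex", "spacy"])
def test_annotate_benchmark(benchmark, engine):
    service = TextService(engine=engine)
    benchmark(service.annotate, TEN_KB_TEXT)  # pytest-benchmark times repeated calls
```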

---
- [ ] Add a comment near the `auto` logic explaining fallback threshold

---
