DataFog · sidmohan0 · Aug 6, 2024 · Aug 4, 2024 · Aug 4, 2024 · Aug 4, 2024
diff --git a/.github/workflows/dev-cicd.yml b/.github/workflows/dev-cicd.yml
@@ -27,10 +27,24 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
       - name: Check out repo
         uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # setting this to "true" frees about 6 GB, but it might
+          # remove tools that are actually needed
+          tool-cache: false
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -54,11 +68,15 @@ jobs:
           pip install -e .
           pip install tox just pre-commit
       - name: Run Tests with tox
-        run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
+        run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
       - name: Submit to Codecov
         uses: codecov/codecov-action@v3
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           files: ./coverage.xml
           flags: unittests
           name: codecov-umbrella
+      - name: Clean up pip cache
+        run: |
+          pip cache purge
+          rm -rf ~/.cache/pip
diff --git a/.github/workflows/feature-cicd.yml b/.github/workflows/feature-cicd.yml
@@ -31,6 +31,20 @@ jobs:
     steps:
       - name: Check out repo
         uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # setting this to "true" frees about 6 GB, but it might
+          # remove tools that are actually needed
+          tool-cache: false
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -51,10 +65,13 @@ jobs:
       - name: Install Dependencies
         run: |
           pip install -U pip
-          pip install -e .
-          pip install tox just pre-commit
+          pip install --no-cache-dir -e .
+          pip install --no-cache-dir tox just pre-commit
+      - name: Free up disk space
+        run: |
+          sudo apt-get clean
       - name: Run Tests with tox
-        run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
+        run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
       - name: Submit to Codecov
         uses: codecov/codecov-action@v3
         with:

diff --git a/.github/workflows/main-cicd.yml b/.github/workflows/main-cicd.yml
@@ -54,7 +54,7 @@ jobs:
           pip install -e .
           pip install tox just pre-commit
       - name: Run Tests with tox
-        run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
+        run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
       - name: Submit to Codecov
         uses: codecov/codecov-action@v3
         with:

diff --git a/README.md b/README.md
@@ -116,7 +116,7 @@ For local development:
      ```
 5. Install the package in editable mode:
    ```
-   pip install -e .
+   pip install -r requirements-dev.txt
    ```
 6. Set up the project:
    ```

diff --git a/datafog/processing/image_processing/donut_processor.py b/datafog/processing/image_processing/donut_processor.py
@@ -5,6 +5,7 @@
 import sys
 from io import BytesIO
 
+import numpy as np
 import requests
 from PIL import Image
 
@@ -13,7 +14,6 @@
 
 class DonutProcessor:
     def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"):
-
         self.ensure_installed("torch")
         self.ensure_installed("transformers")
 
@@ -36,13 +36,31 @@
                 [sys.executable, "-m", "pip", "install", package_name]
             )
 
-    async def parse_image(self, image: Image) -> str:
+    def preprocess_image(self, image: Image.Image) -> np.ndarray:
+        """Return ``image`` as an RGB NumPy array of shape (height, width, 3).
+
+        Args:
+            image: PIL image in any mode; non-RGB input is converted first.
+
+        Returns:
+            The pixel data produced by ``np.array`` on the RGB image.
+        """
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        # An RGB image always converts to a 3-D array, so no grayscale
+        # channel expansion is needed here.
+        return np.array(image)
+
+    async def parse_image(self, image: Image.Image) -> str:
         """Process w/ DonutProcessor and VisionEncoderDecoderModel"""
+        # Preprocess the image
+        image_np = self.preprocess_image(image)
+
         task_prompt = "<s_cord-v2>"
         decoder_input_ids = self.processor.tokenizer(
             task_prompt, add_special_tokens=False, return_tensors="pt"
         ).input_ids
-        pixel_values = self.processor(image, return_tensors="pt").pixel_values
+        pixel_values = self.processor(images=image_np, return_tensors="pt").pixel_values
 
         outputs = self.model.generate(
             pixel_values.to(self.device),
@@ -71,7 +89,7 @@
         image = self.downloader.download_image(url)
         return self.parse_image(image)
 
-    def download_image(self, url: str) -> Image:
+    def download_image(self, url: str) -> Image.Image:
         """Download an image from URL."""
         response = requests.get(url)
         image = Image.open(BytesIO(response.content))

diff --git a/datafog/processing/spark_processing/pyspark_udfs.py b/datafog/processing/spark_processing/pyspark_udfs.py
@@ -7,7 +7,7 @@
 
 
 def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
-    """Extract features using en_spacy_pii_fast model.
+    """Extract features using en_core_web_lg model.
 
     Returns:
         list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
@@ -40,7 +40,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
 
 
 def broadcast_pii_annotator_udf(
-    spark_session=None, spacy_model: str = "en_spacy_pii_fast"
+    spark_session=None, spacy_model: str = "en_core_web_lg"
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
     ensure_installed("pyspark")

diff --git a/datafog/processing/text_processing/spacy_pii_annotator.py b/datafog/processing/text_processing/spacy_pii_annotator.py
@@ -3,7 +3,26 @@
 
 from pydantic import BaseModel
 
-PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
+PII_ANNOTATION_LABELS = [
+    "CARDINAL",
+    "DATE",
+    "EVENT",
+    "FAC",
+    "GPE",
+    "LANGUAGE",
+    "LAW",
+    "LOC",
+    "MONEY",
+    "NORP",
+    "ORDINAL",
+    "ORG",
+    "PERCENT",
+    "PERSON",
+    "PRODUCT",
+    "QUANTITY",
+    "TIME",
+    "WORK_OF_ART",
+]
 MAXIMAL_STRING_SIZE = 1000000
 
 
@@ -12,21 +31,29 @@ class SpacyPIIAnnotator(BaseModel):
 
     @classmethod
     def create(cls) -> "SpacyPIIAnnotator":
-        try:
-            # Try loading as a spaCy model first
-            import spacy
+        import spacy
 
-            nlp = spacy.load("en_spacy_pii_fast")
+        try:
+            nlp = spacy.load("en_core_web_lg")
         except OSError:
-            # If that fails, try importing as a module
-            try:
-                import en_spacy_pii_fast
-
-                nlp = en_spacy_pii_fast.load()
-            except ImportError:
-                raise ImportError(
-                    "Failed to load en_spacy_pii_fast. Make sure it's installed correctly."
-                )
+            import subprocess
+            import sys
+
+            interpreter_location = sys.executable
+            subprocess.run(
+                [
+                    interpreter_location,
+                    "-m",
+                    "pip",
+                    "install",
+                    "--no-deps",
+                    "--no-cache-dir",
+                    "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl",
+                ],
+                check=True,
+            )
+            nlp = spacy.load("en_core_web_lg")
+
         return cls(nlp=nlp)
 
     def annotate(self, text: str) -> Dict[str, List[str]]:

diff --git a/datafog/services/image_service.py b/datafog/services/image_service.py
@@ -1,15 +1,34 @@
 import asyncio
+import io
+import ssl
 from typing import List
 
+import aiohttp
+import certifi
 from PIL import Image
 
 from datafog.processing.image_processing.donut_processor import DonutProcessor
-from datafog.processing.image_processing.image_downloader import ImageDownloader
 from datafog.processing.image_processing.pytesseract_processor import (
     PytesseractProcessor,
 )
 
 
+class ImageDownloader:
+    """Download images asynchronously, verifying TLS with certifi's CA bundle."""
+
+    async def download_image(self, url: str) -> Image.Image:
+        """Fetch ``url`` and return it as a PIL image; raise on non-200 status."""
+        ssl_context = ssl.create_default_context(cafile=certifi.where())
+        connector = aiohttp.TCPConnector(ssl=ssl_context)
+        async with aiohttp.ClientSession(connector=connector) as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    raise Exception(
+                        f"Failed to download image. Status code: {response.status}"
+                    )
+                return Image.open(io.BytesIO(await response.read()))
+
+
 class ImageService:
     def __init__(self, use_donut: bool = False, use_tesseract: bool = True):
         self.downloader = ImageDownloader()
@@ -21,7 +40,11 @@
         )
 
     async def download_images(self, urls: List[str]) -> List[Image.Image]:
-        return await self.downloader.download_images(urls)
+        async def download_image(url: str) -> Image.Image:
+            return await self.downloader.download_image(url)
+
+        tasks = [asyncio.create_task(download_image(url)) for url in urls]
+        return await asyncio.gather(*tasks, return_exceptions=True)
 
     async def ocr_extract(
         self,

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -6,10 +6,12 @@ just
 isort
 black
 blacken-docs
+certifi
 flake8
 prettier
 tox
-pytest
+pytest==7.4.0
+pytest-asyncio==0.21.0
 pytest-cov
 mypy
 autoflake