Skip to content

python 3.10, 3.11, 3.12 support | model #51

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions .github/workflows/dev-cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,24 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
python-version: ["3.10", "3.11", "3.12"]
steps:
- name: Check out repo
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Set up Python
uses: actions/setup-python@v4
with:
Expand All @@ -54,11 +68,15 @@ jobs:
pip install -e .
pip install tox just pre-commit
- name: Run Tests with tox
run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
- name: Submit to Codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
flags: unittests
name: codecov-umbrella
- name: Clean up pip cache
run: |
pip cache purge
rm -rf ~/.cache/pip
23 changes: 20 additions & 3 deletions .github/workflows/feature-cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ jobs:
steps:
- name: Check out repo
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Set up Python
uses: actions/setup-python@v4
with:
Expand All @@ -51,10 +65,13 @@ jobs:
- name: Install Dependencies
run: |
pip install -U pip
pip install -e .
pip install tox just pre-commit
pip install --no-cache-dir -e .
pip install --no-cache-dir tox just pre-commit
- name: Free up disk space
run: |
sudo apt-get clean
- name: Run Tests with tox
run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
- name: Submit to Codecov
uses: codecov/codecov-action@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/main-cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
pip install -e .
pip install tox just pre-commit
- name: Run Tests with tox
run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
- name: Submit to Codecov
uses: codecov/codecov-action@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ For local development:
```
5. Install the package in editable mode:
```
pip install -e .
pip install -r requirements-dev.txt
```
6. Set up the project:
```
Expand Down
26 changes: 22 additions & 4 deletions datafog/processing/image_processing/donut_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
from io import BytesIO

import numpy as np
import requests
from PIL import Image

Expand All @@ -13,7 +14,6 @@

class DonutProcessor:
def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"):

self.ensure_installed("torch")
self.ensure_installed("transformers")

Expand All @@ -36,13 +36,31 @@
[sys.executable, "-m", "pip", "install", package_name]
)

async def parse_image(self, image: Image) -> str:
def preprocess_image(self, image: Image.Image) -> np.ndarray:
    """Return ``image`` as an RGB NumPy array.

    Args:
        image: PIL image in any mode.

    Returns:
        The array built by ``np.array`` from the RGB version of ``image``;
        for a PIL image this is laid out as (height, width, channels) with
        three channels.
    """
    # Converting to RGB up front means every input mode reaches the array
    # conversion with three channels, so a separate "expand a 2-D grayscale
    # array to three channels" step can never trigger and is not needed.
    if image.mode != "RGB":
        image = image.convert("RGB")

    return np.array(image)

async def parse_image(self, image: Image.Image) -> str:
"""Process w/ DonutProcessor and VisionEncoderDecoderModel"""
# Preprocess the image
image_np = self.preprocess_image(image)

task_prompt = "<s_cord-v2>"
decoder_input_ids = self.processor.tokenizer(
task_prompt, add_special_tokens=False, return_tensors="pt"
).input_ids
pixel_values = self.processor(image, return_tensors="pt").pixel_values
pixel_values = self.processor(images=image_np, return_tensors="pt").pixel_values

outputs = self.model.generate(
pixel_values.to(self.device),
Expand Down Expand Up @@ -71,7 +89,7 @@
image = self.downloader.download_image(url)
return self.parse_image(image)

def download_image(self, url: str) -> Image:
def download_image(self, url: str) -> Image.Image:
"""Download an image from URL."""
response = requests.get(url)
image = Image.open(BytesIO(response.content))
Expand Down
4 changes: 2 additions & 2 deletions datafog/processing/spark_processing/pyspark_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
"""Extract features using en_spacy_pii_fast model.
"""Extract features using en_core_web_lg model.

Returns:
list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
Expand Down Expand Up @@ -40,7 +40,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:


def broadcast_pii_annotator_udf(
spark_session=None, spacy_model: str = "en_spacy_pii_fast"
spark_session=None, spacy_model: str = "en_core_web_lg"
):
"""Broadcast PII annotator across Spark cluster and create UDF"""
ensure_installed("pyspark")
Expand Down
55 changes: 41 additions & 14 deletions datafog/processing/text_processing/spacy_pii_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,26 @@

from pydantic import BaseModel

PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
PII_ANNOTATION_LABELS = [
"CARDINAL",
"DATE",
"EVENT",
"FAC",
"GPE",
"LANGUAGE",
"LAW",
"LOC",
"MONEY",
"NORP",
"ORDINAL",
"ORG",
"PERCENT",
"PERSON",
"PRODUCT",
"QUANTITY",
"TIME",
"WORK_OF_ART",
]
MAXIMAL_STRING_SIZE = 1000000


Expand All @@ -12,21 +31,29 @@ class SpacyPIIAnnotator(BaseModel):

@classmethod
def create(cls) -> "SpacyPIIAnnotator":
    """Build an annotator backed by spaCy's ``en_core_web_lg`` pipeline.

    If the model cannot be loaded, the pinned 3.7.1 wheel is installed with
    pip into the running interpreter and the load is retried once.

    Returns:
        A ``SpacyPIIAnnotator`` wrapping the loaded pipeline.

    Raises:
        subprocess.CalledProcessError: If the pip installation fails.
        OSError: If the model still cannot be loaded after installation.
    """
    # Imported lazily so that importing this module does not require spaCy.
    import spacy

    try:
        nlp = spacy.load("en_core_web_lg")
    except OSError:
        # The model package is not available yet: install it on demand so a
        # fresh environment works without a separate download step.
        import subprocess
        import sys

        subprocess.run(
            [
                # Use the current interpreter so the wheel lands in the same
                # environment that is running this code.
                sys.executable,
                "-m",
                "pip",
                "install",
                "--no-deps",
                "--no-cache-dir",
                "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl",
            ],
            check=True,
        )
        nlp = spacy.load("en_core_web_lg")

    return cls(nlp=nlp)

def annotate(self, text: str) -> Dict[str, List[str]]:
Expand Down
27 changes: 25 additions & 2 deletions datafog/services/image_service.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,34 @@
import asyncio
import io
import ssl
from typing import List

import aiohttp
import certifi
from PIL import Image

from datafog.processing.image_processing.donut_processor import DonutProcessor
from datafog.processing.image_processing.image_downloader import ImageDownloader
from datafog.processing.image_processing.pytesseract_processor import (
PytesseractProcessor,
)


class ImageDownloader:
    """Fetch a remote image with aiohttp and open it with PIL."""

    async def download_image(self, url: str) -> Image.Image:
        """Download ``url`` and open the response body as a PIL image.

        Args:
            url: Address of the image to fetch.

        Returns:
            The image opened from the downloaded bytes.

        Raises:
            Exception: If the server answers with a status other than 200.
        """
        # Verify TLS against certifi's CA bundle.
        ssl_context = ssl.create_default_context(cafile=certifi.where())
        connector = aiohttp.TCPConnector(ssl=ssl_context)
        async with aiohttp.ClientSession(connector=connector) as session:
            async with session.get(url) as response:
                # Fail fast on anything but 200 so the success path below
                # stays unnested.
                if response.status != 200:
                    raise Exception(
                        f"Failed to download image. Status code: {response.status}"
                    )
                image_data = await response.read()
                return Image.open(io.BytesIO(image_data))


class ImageService:
def __init__(self, use_donut: bool = False, use_tesseract: bool = True):
self.downloader = ImageDownloader()
Expand All @@ -21,7 +40,11 @@
)

async def download_images(self, urls: List[str]) -> List[Image.Image]:
    """Download every URL in ``urls`` concurrently.

    Args:
        urls: Image addresses to fetch.

    Returns:
        One entry per URL, in the same order as ``urls``. The downloads are
        gathered with ``return_exceptions=True``, so a failed download
        appears as the raised exception object in its slot instead of
        aborting the whole batch; callers should check each entry.
    """
    # gather() schedules each coroutine as its own task, so the downloads
    # overlap without an explicit create_task() wrapper.
    downloads = (self.downloader.download_image(url) for url in urls)
    return await asyncio.gather(*downloads, return_exceptions=True)

async def ocr_extract(
self,
Expand Down
4 changes: 3 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ just
isort
black
blacken-docs
certifi
flake8
prettier
tox
pytest
pytest==7.4.0
pytest-asyncio==0.21.0
pytest-cov
mypy
autoflake
Expand Down
Loading
Loading