Merge pull request #32 from DataFog/synchronous_processing

sidmohan0 · web-flow · commit 4389855ec801 · 2024-07-13T22:22:11.000-07:00
Add synchronous processing
diff --git a/datafog/__about__.py b/datafog/__about__.py
@@ -1 +1 @@
-__version__ = "3.2.2"
+__version__ = "3.3.0"
diff --git a/datafog/__init__.py b/datafog/__init__.py
@@ -1,6 +1,6 @@
 from .__about__ import __version__
 from .config import OperationType
-from .main import DataFog, OCRPIIAnnotator, TextPIIAnnotator
+from .main import DataFog, TextPIIAnnotator
 from .processing.image_processing.donut_processor import DonutProcessor
 from .processing.image_processing.image_downloader import ImageDownloader
 from .processing.image_processing.pytesseract_processor import PytesseractProcessor
@@ -13,7 +13,6 @@
     "DonutProcessor",
     "DataFog",
     "ImageService",
-    "OCRPIIAnnotator",
     "OperationType",
     "SparkService",
     "TextPIIAnnotator",
diff --git a/datafog/main.py b/datafog/main.py
@@ -1,6 +1,5 @@
 import json
 import logging
-from logging import INFO
 from typing import List
 
 from .config import OperationType
@@ -10,7 +9,7 @@
 from .services.text_service import TextService
 
 logger = logging.getLogger("datafog_logger")
-logger.setLevel(INFO)
+logger.setLevel(logging.INFO)
 
 
 class DataFog:
@@ -37,7 +36,7 @@ def __init__(
         self.logger.info(f"Operations: {operations}")
 
     async def run_ocr_pipeline(self, image_urls: List[str]):
-        """Run the OCR pipeline asynchronously."""
+        """Run the OCR pipeline asynchronously on a list of images provided via URL."""
         try:
             extracted_text = await self.image_service.ocr_extract(image_urls)
             self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
@@ -46,7 +45,7 @@ async def run_ocr_pipeline(self, image_urls: List[str]):
             )
 
             if OperationType.ANNOTATE_PII in self.operations:
-                annotated_text = await self.text_service.batch_annotate_texts(
+                annotated_text = await self.text_service.batch_annotate_text_async(
                     extracted_text
                 )
                 self.logger.info(
@@ -59,55 +58,45 @@ async def run_ocr_pipeline(self, image_urls: List[str]):
             self.logger.error(f"Error in run_ocr_pipeline: {str(e)}")
             raise
 
-    async def run_text_pipeline(self, texts: List[str]):
-        """Run the text pipeline asynchronously."""
+    async def run_text_pipeline(self, str_list: List[str]):
+        """Run the text pipeline asynchronously on a list of input text."""
         try:
-            self.logger.info(f"Starting text pipeline with {len(texts)} texts.")
+            self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
             if OperationType.ANNOTATE_PII in self.operations:
-                annotated_text = await self.text_service.batch_annotate_texts(texts)
+                annotated_text = await self.text_service.batch_annotate_text_async(
+                    str_list
+                )
                 self.logger.info(
                     f"Text annotation completed with {len(annotated_text)} annotations."
                 )
                 return annotated_text
 
             self.logger.info("No annotation operation found; returning original texts.")
-            return texts
+            return str_list
         except Exception as e:
             self.logger.error(f"Error in run_text_pipeline: {str(e)}")
             raise
 
-    def _add_attributes(self, attributes: dict):
-        """Add multiple attributes."""
-        for key, value in attributes.items():
-            pass
-
-
-class OCRPIIAnnotator:
-    def __init__(self):
-        self.image_service = ImageService(use_donut=True, use_tesseract=False)
-        self.text_annotator = SpacyPIIAnnotator.create()
-        self.spark_service: SparkService = None
-
-    async def run(self, image_urls: List[str], output_path=None):
+    def run_text_pipeline_sync(self, str_list: List[str]):
+        """Run the text pipeline synchronously on a list of input text."""
         try:
-            # Download and process the image to extract text
-            # downloaded_images = await self.image_service.download_images(image_urls)
-            # extracted_texts = await self.image_service.ocr_extract(downloaded_images)
-
-            # # Annotate the extracted text for PII
-            # annotated_texts = [self.text_annotator.annotate(text) for text in extracted_texts]
-
-            # # Optionally, output the results to a JSON file
-            # if output_path:
-            #     with open(output_path, "w") as f:
-            #         json.dump(annotated_texts, f)
+            self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
+            if OperationType.ANNOTATE_PII in self.operations:
+                annotated_text = self.text_service.batch_annotate_text_sync(str_list)
+                self.logger.info(
+                    f"Text annotation completed with {len(annotated_text)} annotations."
+                )
+                return annotated_text
 
-            # return annotated_texts
-            pass
+            self.logger.info("No annotation operation found; returning original texts.")
+            return str_list
+        except Exception as e:
+            self.logger.error(f"Error in run_text_pipeline: {str(e)}")
+            raise
 
-        finally:
-            # Ensure Spark resources are released
-            # self.spark_processor.spark.stop()
+    def _add_attributes(self, attributes: dict):
+        """Add multiple attributes."""
+        for key, value in attributes.items():
             pass
 
 
diff --git a/datafog/services/text_service.py b/datafog/services/text_service.py
@@ -7,12 +7,24 @@ class TextService:
     def __init__(self):
         self.annotator = SpacyPIIAnnotator.create()
 
-    async def annotate_text(self, text):
-        """Asynchronously annotate a single piece of text."""
+    def annotate_text_sync(self, text):
+        """Synchronously annotate a text string."""
+        print(f"Starting on {text.split()[0]}")
+        res = self.annotator.annotate(text)
+        print(f"Done processing {text.split()[0]}")
+        return res
+
+    def batch_annotate_text_sync(self, texts: list):
+        """Synchronously annotate a list of text input."""
+        results = [self.annotate_text_sync(text) for text in texts]
+        return dict(zip(texts, results, strict=True))
+
+    async def annotate_text_async(self, text):
+        """Asynchronously annotate a text string."""
         return await asyncio.to_thread(self.annotator.annotate, text)
 
-    async def batch_annotate_texts(self, texts: list):
-        """Asynchronously annotate a batch of texts."""
-        tasks = [self.annotate_text(text) for text in texts]
+    async def batch_annotate_text_async(self, text: list):
+        """Asynchronously annotate a list of text input."""
+        tasks = [self.annotate_text_async(txt) for txt in text]
         results = await asyncio.gather(*tasks)
-        return dict(zip(texts, results, strict=True))
+        return dict(zip(text, results, strict=True))
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     long_description = f.read()
 
 # Use a single source of truth for the version
-__version__ = "3.2.2"
+__version__ = "3.3.0"
 
 project_urls = {
     "Homepage": "https://datafog.ai",
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -36,6 +36,21 @@ def test_textpii_annotator():
 #     assert "Satya Nadella" in annotated_text[0].get("PER", []), "PII not annotated correctly."
 
 
+def test_datafog_text_annotation_sync():
+    """Test DataFog class for synchronous text annotation."""
+    text = ["Joe Biden is the President of the United States."]
+    datafog = DataFog()
+    annotated_text = datafog.run_text_pipeline_sync(text)
+
+    assert annotated_text  # Ensure that some results are returned.
+    assert search_nested_dict(
+        annotated_text, "Joe Biden"
+    ), "Joe Biden not found in annotated results."
+    assert search_nested_dict(
+        annotated_text, "the United States"
+    ), "United States not found in annotated results."
+
+
 @pytest.mark.asyncio
 async def test_datafog_text_annotation():
     """Test DataFog class for text annotation."""

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "3.2.2"`
	`1`	`+__version__ = "3.3.0"`