import json
import logging
-from logging import INFO
from typing import List

from .config import OperationType
from .services.text_service import TextService

logger = logging.getLogger("datafog_logger")
-logger.setLevel(INFO)
+logger.setLevel(logging.INFO)


class DataFog:
@@ -37,7 +36,7 @@ def __init__(
        self.logger.info(f"Operations: {operations}")

    async def run_ocr_pipeline(self, image_urls: List[str]):
-        """Run the OCR pipeline asynchronously."""
+        """Run the OCR pipeline asynchronously on a list of images provided via URL."""
        try:
            extracted_text = await self.image_service.ocr_extract(image_urls)
            self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
@@ -46,7 +45,7 @@ async def run_ocr_pipeline(self, image_urls: List[str]):
            )

            if OperationType.ANNOTATE_PII in self.operations:
-                annotated_text = await self.text_service.batch_annotate_texts(
+                annotated_text = await self.text_service.batch_annotate_text_async(
                    extracted_text
                )
                self.logger.info(
@@ -59,55 +58,45 @@ async def run_ocr_pipeline(self, image_urls: List[str]):
            self.logger.error(f"Error in run_ocr_pipeline: {str(e)}")
            raise

-    async def run_text_pipeline(self, texts: List[str]):
-        """Run the text pipeline asynchronously."""
+    async def run_text_pipeline(self, str_list: List[str]):
+        """Run the text pipeline asynchronously on a list of input text."""
        try:
-            self.logger.info(f"Starting text pipeline with {len(texts)} texts.")
+            self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
            if OperationType.ANNOTATE_PII in self.operations:
-                annotated_text = await self.text_service.batch_annotate_texts(texts)
+                annotated_text = await self.text_service.batch_annotate_text_async(
+                    str_list
+                )
                self.logger.info(
                    f"Text annotation completed with {len(annotated_text)} annotations."
                )
                return annotated_text

            self.logger.info("No annotation operation found; returning original texts.")
-            return texts
+            return str_list
        except Exception as e:
            self.logger.error(f"Error in run_text_pipeline: {str(e)}")
            raise

-    def _add_attributes(self, attributes: dict):
-        """Add multiple attributes."""
-        for key, value in attributes.items():
-            pass
-
-
-class OCRPIIAnnotator:
-    def __init__(self):
-        self.image_service = ImageService(use_donut=True, use_tesseract=False)
-        self.text_annotator = SpacyPIIAnnotator.create()
-        self.spark_service: SparkService = None
-
-    async def run(self, image_urls: List[str], output_path=None):
+    def run_text_pipeline_sync(self, str_list: List[str]):
+        """Run the text pipeline synchronously on a list of input text."""
        try:
-            # Download and process the image to extract text
-            # downloaded_images = await self.image_service.download_images(image_urls)
-            # extracted_texts = await self.image_service.ocr_extract(downloaded_images)
-
-            # # Annotate the extracted text for PII
-            # annotated_texts = [self.text_annotator.annotate(text) for text in extracted_texts]
-
-            # # Optionally, output the results to a JSON file
-            # if output_path:
-            #     with open(output_path, "w") as f:
-            #         json.dump(annotated_texts, f)
+            self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
+            if OperationType.ANNOTATE_PII in self.operations:
+                annotated_text = self.text_service.batch_annotate_text_sync(str_list)
+                self.logger.info(
+                    f"Text annotation completed with {len(annotated_text)} annotations."
+                )
+                return annotated_text

-            # return annotated_texts
-            pass
+            self.logger.info("No annotation operation found; returning original texts.")
+            return str_list
+        except Exception as e:
+            self.logger.error(f"Error in run_text_pipeline: {str(e)}")
+            raise

-        finally:
-            # Ensure Spark resources are released
-            # self.spark_processor.spark.stop()
+    def _add_attributes(self, attributes: dict):
+        """Add multiple attributes."""
+        for key, value in attributes.items():
            pass

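For reference, a minimal usage sketch of the entry points touched by this commit. It assumes the package is importable as datafog, that DataFog() can be constructed with default arguments, and that ANNOTATE_PII is among the default operations; none of that is shown in this diff.

import asyncio

from datafog import DataFog  # assumed import path, not confirmed by this diff

client = DataFog()  # assumes defaults include the ANNOTATE_PII operation

# Synchronous text path, which delegates to TextService.batch_annotate_text_sync.
print(client.run_text_pipeline_sync(["My name is Jane Doe"]))

# Asynchronous text path, which delegates to TextService.batch_annotate_text_async.
async def annotate():
    return await client.run_text_pipeline(["My name is Jane Doe"])

print(asyncio.run(annotate()))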