DataFog · sidmohan0 · May 28, 2024 · May 18, 2024 · May 18, 2024 · May 18, 2024
diff --git a/.env b/.env
@@ -0,0 +1,4 @@
+APPLICATIONINSIGHTS_CONNECTION_STRING="InstrumentationKey=00bea047-1836-46fa-9652-26d43d63a3fa;IngestionEndpoint=https://eastus-8.in.applicationinsights.azure.com/;LiveEndpoint=https://eastus.livediagnostics.monitor.azure.com/;ApplicationId=959cc365-c112-491b-af69-b196d0943ca4"
+
+
+# Note: this is an Azure-specific implementation of the OpenTelemetry distro. For more information, see https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry
diff --git a/.github/workflows/dev-cicd-tests.yml b/.github/workflows/dev-cicd-tests.yml
@@ -21,7 +21,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/setup-python@v4
         with:
@@ -33,10 +33,16 @@ jobs:
           tox -- --cov datafog --cov-report xml --cov-report term
       - name: Submit to codecov
         uses: codecov/codecov-action@v3
-        if: ${{ matrix.python-version == '3.11' }}
+        if: ${{ matrix.python-version == '3.10' }}
 
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4.0.1
         env:
           token: ${{ secrets.CODECOV_TOKEN }}
           slug: DataFog/datafog-python
+
+      - name: Run script
+        env:
+          APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }}
+        run: |
+          python datafog/telemetry/open_telemetry.py
diff --git a/.github/workflows/feature-ci-cd.yml b/.github/workflows/feature-ci-cd.yml
@@ -0,0 +1,48 @@
+name: feature-cicd-tests
+
+on:
+  push:
+    branches:
+      - feature/*
+  pull_request:
+    branches:
+      - feature/*
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.1
+
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v3
+      - name: Test with tox
+        run: |
+          pip install tox
+          tox -- --cov datafog --cov-report xml --cov-report term
+      - name: Submit to codecov
+        uses: codecov/codecov-action@v3
+        if: ${{ matrix.python-version == '3.10' }}
+
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v4.0.1
+        env:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          slug: DataFog/datafog-python
+
+      - name: Run script
+        env:
+          APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }}
+        run: |
+          python datafog/telemetry/open_telemetry.py
diff --git a/.gitignore b/.gitignore
@@ -24,4 +24,5 @@ datafog-python/datafog/processing/text_processing/__pycache__/
 datafog-python/datafog/services/__pycache__/
 datafog-python/datafog/processing/__pycache__/
 datafog-python/datafog/__pycache__/
+.env
 
diff --git a/README.md b/README.md
@@ -39,7 +39,6 @@ DataFog can be installed via pip:
 pip install datafog
 ```
 
-
 ## Getting Started
 
 The DataFog library provides functionality for text and image processing, including PII (Personally Identifiable Information) annotation and OCR (Optical Character Recognition) capabilities.
@@ -54,8 +53,7 @@ pip install datafog
 
 ### Usage
 
-The [Getting Started notebook](/datafog-python/examples/getting_started.ipynb)  features a standalone Colab notebook. 
-
+The [Getting Started notebook](/datafog-python/examples/getting_started.ipynb) features a standalone Colab notebook.
 
 #### Text PII Annotation
 
@@ -75,7 +73,9 @@ with open(os.path.join(folder_path, text_files[0]), 'r') as file:
 
 display(Markdown(clinical_note))
 ```
+
 which looks like this:
+
 ```
 
 **Date:** April 10, 2024
@@ -124,7 +124,6 @@ loop = asyncio.get_event_loop()
 results = loop.run_until_complete(run_text_pipeline_demo())
 ```
 
-
 Note: The DataFog library uses asynchronous programming, so make sure to use the `async`/`await` syntax when calling the appropriate methods.
 
 #### OCR PII Annotation
@@ -146,7 +145,7 @@ loop.run_until_complete(run_ocr_pipeline_demo())
 
 ```
 
-You'll notice that we use async functions liberally throughout the SDK - given the nature of the functions we're providing and the extension of DataFog into API/other formats, this allows the functions to be more easily adapted for those uses. 
+You'll notice that we use async functions liberally throughout the SDK. Given the nature of the functions we're providing and the extension of DataFog into API and other formats, this makes the functions easier to adapt for those uses.
 
 ## Contributing
 

diff --git a/datafog/__about__.py b/datafog/__about__.py
@@ -1 +1 @@
-__version__ = "3.2.1b2"
+__version__ = "3.2.1b3"
diff --git a/datafog/__init__.py b/datafog/__init__.py
@@ -1,3 +1,4 @@
+from .__about__ import __version__
 from .config import OperationType
 from .main import DataFog, OCRPIIAnnotator, TextPIIAnnotator
 from .processing.image_processing.donut_processor import DonutProcessor
@@ -7,8 +8,7 @@
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
 from .services.text_service import TextService
-
-from .__about__ import __version__
+from .telemetry import Telemetry
 
 __all__ = [
     "DonutProcessor",
@@ -23,4 +23,5 @@
     "ImageDownloader",
     "PytesseractProcessor",
     "__version__",
+    "Telemetry",
 ]
diff --git a/datafog/config.py b/datafog/config.py
@@ -1,5 +1,6 @@
 from enum import Enum
 
+
 class OperationType(str, Enum):
     ANNOTATE_PII = "annotate_pii"
     EXTRACT_TEXT = "extract_text"

diff --git a/datafog/main.py b/datafog/main.py
@@ -13,7 +13,34 @@
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
 from .services.text_service import TextService
-
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+import os
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from azure.monitor.opentelemetry.exporter import AzureMonitorTraceExporter
+from azure.monitor.opentelemetry import configure_azure_monitor
+import platform
+from opentelemetry.trace import Status, StatusCode
+
+# The Application Insights connection string is read from the environment (populated from .env by load_dotenv() below); there is no hardcoded fallback.
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from logging import INFO, getLogger
+from dotenv import load_dotenv
+import logging
+
+load_dotenv()  # Load environment variables from .env file
+APPLICATIONINSIGHTS_CONNECTION_STRING = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
+configure_azure_monitor(connection_string=APPLICATIONINSIGHTS_CONNECTION_STRING)
+trace.set_tracer_provider(TracerProvider())
+exporter = AzureMonitorTraceExporter(connection_string=APPLICATIONINSIGHTS_CONNECTION_STRING)
+trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(exporter))
+logger = logging.getLogger("datafog_logger")
+logger.setLevel(INFO)
 
 class DataFog:
     def __init__(
@@ -27,23 +54,52 @@ def __init__(
         self.text_service = text_service
         self.spark_service: SparkService = spark_service
         self.operations: List[OperationType] = operations
+        self.logger = logging.getLogger(__name__)
+        self.logger.info("Initializing DataFog class with the following services and operations:")
+        self.logger.info(f"Image Service: {type(image_service)}")
+        self.logger.info(f"Text Service: {type(text_service)}")
+        self.logger.info(f"Spark Service: {type(spark_service) if spark_service else 'None'}")
+        self.logger.info(f"Operations: {operations}")
+        self.tracer = trace.get_tracer(__name__)
 
     async def run_ocr_pipeline(self, image_urls: List[str]):
         """Run the OCR pipeline asynchronously."""
-        extracted_text = await self.image_service.ocr_extract(image_urls)
-        if OperationType.ANNOTATE_PII in self.operations:
-            annotated_text = await self.text_service.batch_annotate_texts(
-                extracted_text
-            )
-            return annotated_text
-        return extracted_text
-
+        with self.tracer.start_as_current_span("run_ocr_pipeline") as span:
+            try:
+                extracted_text = await self.image_service.ocr_extract(image_urls)
+                self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
+                self.logger.debug(f"Total length of extracted text: {sum(len(text) for text in extracted_text)}")
+
+                if OperationType.ANNOTATE_PII in self.operations:
+                    annotated_text = await self.text_service.batch_annotate_texts(extracted_text)
+                    self.logger.info(f"Text annotation completed with {len(annotated_text)} annotations.")
+                    return annotated_text
+
+                return extracted_text
+            except Exception as e:
+                self.logger.error(f"Error in run_ocr_pipeline: {str(e)}")
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
     async def run_text_pipeline(self, texts: List[str]):
         """Run the text pipeline asynchronously."""
-        if OperationType.ANNOTATE_PII in self.operations:
-            annotated_text = await self.text_service.batch_annotate_texts(texts)
-            return annotated_text
-        return texts
+        with self.tracer.start_as_current_span("run_text_pipeline") as span:
+            try:
+                self.logger.info(f"Starting text pipeline with {len(texts)} texts.")
+                if OperationType.ANNOTATE_PII in self.operations:
+                    annotated_text = await self.text_service.batch_annotate_texts(texts)
+                    self.logger.info(f"Text annotation completed with {len(annotated_text)} annotations.")
+                    return annotated_text
+
+                self.logger.info("No annotation operation found; returning original texts.")
+                return texts
+            except Exception as e:
+                self.logger.error(f"Error in run_text_pipeline: {str(e)}")
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
+    def _add_attributes(self, span, attributes: dict):
+        """Add multiple attributes to a span."""
+        for key, value in attributes.items():
+            span.set_attribute(key, value)
 
 
 class OCRPIIAnnotator:

diff --git a/datafog/processing/__init__.py b/datafog/processing/__init__.py
@@ -1,7 +1,7 @@
 from .image_processing.donut_processor import DonutProcessor
 from .image_processing.image_downloader import ImageDownloader
 from .image_processing.pytesseract_processor import PytesseractProcessor
+
 # from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
 from .spark_processing import get_pyspark_udfs
 from .text_processing.spacy_pii_annotator import SpacyPIIAnnotator
-
diff --git a/datafog/processing/spark_processing/__init__.py b/datafog/processing/spark_processing/__init__.py
@@ -1,5 +1,7 @@
 # from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
 
+
 def get_pyspark_udfs():
     from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
     return broadcast_pii_annotator_udf, pii_annotator
diff --git a/datafog/processing/spark_processing/pyspark_udfs.py b/datafog/processing/spark_processing/pyspark_udfs.py
@@ -1,9 +1,10 @@
-import requests
-import spacy
 import importlib
 import subprocess
 import sys
 
+import requests
+import spacy
+
 PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
 MAXIMAL_STRING_SIZE = 1000000
 
@@ -40,13 +41,14 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
 
 
 def broadcast_pii_annotator_udf(
-    spark_session = None, spacy_model: str = "en_spacy_pii_fast"
+    spark_session=None, spacy_model: str = "en_spacy_pii_fast"
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
     ensure_installed("pyspark")
     from pyspark.sql import SparkSession
     from pyspark.sql.functions import udf
     from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+
     if not spark_session:
         spark_session = SparkSession.builder.getOrCreate()
     broadcasted_nlp = spark_session.sparkContext.broadcast(spacy.load(spacy_model))
@@ -62,6 +64,4 @@ def ensure_installed(self, package_name):
     try:
         importlib.import_module(package_name)
     except ImportError:
-        subprocess.check_call(
-            [sys.executable, "-m", "pip", "install", package_name]
-        )
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
diff --git a/datafog/services/spark_service.py b/datafog/services/spark_service.py
@@ -1,9 +1,8 @@
-import json
-from typing import Any, List
 import importlib
+import json
 import subprocess
 import sys
-
+from typing import Any, List
 
 
 class SparkService:
@@ -14,6 +13,7 @@ def __init__(self):
         from pyspark.sql import DataFrame, SparkSession
         from pyspark.sql.functions import udf
         from pyspark.sql.types import ArrayType, StringType
+
         self.SparkSession = SparkSession
         self.DataFrame = DataFrame
         self.udf = udf
@@ -22,7 +22,7 @@ def __init__(self):
 
     def create_spark_session(self):
         return self.SparkSession.builder.appName("datafog").getOrCreate()
-    
+
     def read_json(self, path: str) -> List[dict]:
         return self.spark.read.json(path).collect()
 
@@ -33,4 +33,3 @@ def ensure_installed(self, package_name):
             subprocess.check_call(
                 [sys.executable, "-m", "pip", "install", package_name]
             )
-
diff --git a/datafog/telemetry/__init__.py b/datafog/telemetry/__init__.py
@@ -0,0 +1 @@
+from .open_telemetry import Telemetry
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		APPLICATIONINSIGHTS_CONNECTION_STRING="InstrumentationKey=00bea047-1836-46fa-9652-26d43d63a3fa;IngestionEndpoint=https://eastus-8.in.applicationinsights.azure.com/;LiveEndpoint=https://eastus.livediagnostics.monitor.azure.com/;ApplicationId=959cc365-c112-491b-af69-b196d0943ca4"


		# Note: this is an Azure-specific implementation of the OpenTelemetry distro. For more information, see https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry