DataFog
diff --git a/‎examples/datafog_python_v3_examples.ipynb
Lines changed: 260 additions & 0 deletions b/‎examples/datafog_python_v3_examples.ipynb
Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "WNtUZ497_0kd"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install 'datafog==3.0.1'"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Example: Annotating PII from text\n"
+      ],
+      "metadata": {
+        "id": "GWl2SrygBvM8"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Setup"
+      ],
+      "metadata": {
+        "id": "0p1U8dU8KBS3"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from pyspark.sql import SparkSession\n",
+        "spark = SparkSession.builder \\\n",
+        "    .appName(\"DataFog\") \\\n",
+        "    .config(\"spark.driver.memory\", \"8g\") \\\n",
+        "    .config(\"spark.executor.memory\", \"8g\") \\\n",
+        "    .getOrCreate()"
+      ],
+      "metadata": {
+        "id": "fwv9QpGIEuAn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [],
+      "metadata": {
+        "id": "DVd1AtvqIkuA"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Spark Functions to broadcast over DataFrame"
+      ],
+      "metadata": {
+        "id": "2u8MhJAPImW0"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from pyspark.sql import SparkSession\n",
+        "from pyspark.sql.functions import udf\n",
+        "from pyspark.sql.types import ArrayType, StringType\n",
+        "from pyspark.sql.types import StructType, StructField, StringType\n",
+        "import spacy\n",
+        "import requests\n",
+        "\n",
+        "PII_ANNOTATION_LABELS = [\"DATE_TIME\", \"LOC\", \"NRP\", \"ORG\", \"PER\"]\n",
+        "MAXIMAL_STRING_SIZE = 1000000\n",
+        "\n",
+        "def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:\n",
+        "    \"\"\"Extract features using en_spacy_pii_fast model.\n",
+        "\n",
+        "    Returns:\n",
+        "        list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.\n",
+        "    \"\"\"\n",
+        "    if text:\n",
+        "        if len(text) > MAXIMAL_STRING_SIZE:\n",
+        "            # Cut the strings for required sizes\n",
+        "            text = text[:MAXIMAL_STRING_SIZE]\n",
+        "        nlp = broadcasted_nlp.value\n",
+        "        doc = nlp(text)\n",
+        "\n",
+        "        # Pre-create dictionary with labels matching to expected extracted entities\n",
+        "        classified_entities: dict[str, list[str]] = {\n",
+        "            _label: [] for _label in PII_ANNOTATION_LABELS\n",
+        "        }\n",
+        "        for ent in doc.ents:\n",
+        "            # Add entities from extracted values\n",
+        "            classified_entities[ent.label_].append(ent.text)\n",
+        "\n",
+        "        return [_ent for _ent in classified_entities.values()]\n",
+        "    else:\n",
+        "        return [[] for _ in PII_ANNOTATION_LABELS]\n",
+        "\n",
+        "def broadcast_pii_annotator_udf(spark_session: SparkSession, spacy_model: str = \"en_spacy_pii_fast\"):\n",
+        "    \"\"\"Broadcast PII annotator across Spark cluster and create UDF\"\"\"\n",
+        "    broadcasted_nlp = spark_session.sparkContext.broadcast(\n",
+        "        spacy.load(spacy_model)\n",
+        "    )\n",
+        "\n",
+        "    pii_annotation_udf = udf(\n",
+        "        lambda text: pii_annotator(text, broadcasted_nlp),\n",
+        "        ArrayType(ArrayType(StringType())),\n",
+        "    )\n",
+        "    return pii_annotation_udf"
+      ],
+      "metadata": {
+        "id": "H-q24tYIF-Bw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sotu_url = 'https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt'\n",
+        "\n",
+        "# Fetch the content of the text file\n",
+        "response = requests.get(sotu_url)\n",
+        "sotu_text = response.text\n",
+        "\n",
+        "# Create a DataFrame from the text data\n",
+        "df = spark.createDataFrame([(line,) for line in sotu_text.split('\\n') if line], [\"text\"])\n",
+        "df.show()\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "-GzGCpA6JKqB",
+        "outputId": "491e31ea-d965-4d8b-8a58-c6e202d6d01b"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+--------------------+\n",
+            "|                text|\n",
+            "+--------------------+\n",
+            "|Mr. Speaker, Mada...|\n",
+            "|And, by the way, ...|\n",
+            "|Members of the Ca...|\n",
+            "|You know, I start...|\n",
+            "|Speaker, I don’t ...|\n",
+            "|And I want to con...|\n",
+            "|He won despite th...|\n",
+            "|Congratulations t...|\n",
+            "|And congratulatio...|\n",
+            "|Well, I tell you ...|\n",
+            "|Folks, the story ...|\n",
+            "|We’re the only co...|\n",
+            "|Look, folks, that...|\n",
+            "|Two years ago, th...|\n",
+            "|Two years ago — a...|\n",
+            "|And two years ago...|\n",
+            "|As we gather here...|\n",
+            "|When world leader...|\n",
+            "|You know, we’re o...|\n",
+            "|Yes, we disagreed...|\n",
+            "+--------------------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Feature Extraction"
+      ],
+      "metadata": {
+        "id": "6SkvKcgKJ79m"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "extract_features_udf = broadcast_pii_annotator_udf(spark, spacy_model=\"en_spacy_pii_fast\")\n",
+        "\n",
+        "df = df.withColumn(\"en_spacy_pii_fast\", extract_features_udf(df.text))\n",
+        "df.show(truncate=False)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "euhCyV-4JoU0",
+        "outputId": "99e883a3-3e02-497c-bd58-4b50ac3996d4"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+\n",
+            "|text                                                                                                                                                                                                                                                                                 |en_spacy_pii_fast                                                        |\n",
+            "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+\n",
+            "|Mr. Speaker, Madam Vice President, our First Lady and Second Gentleman — good to see you guys up there — members of Congress —                                                                                                                                                       |[[], [Madam], [], [First Lady, Congress], [Speaker, Second Gentleman]]   |\n",
+            "|And, by the way, Chief Justice, I may need a court order. She gets to go to the game tomorr- — next week. I have to stay home. We got to work something out here.                                                                                                                    |[[next week], [], [], [], [Chief Justice]]                               |\n",
+            "|Members of the Cabinet, leaders of our military, Chief Justice, Associate Justices, and retired Justices of the Supreme Court, and to you, my fellow Americans:                                                                                                                      |[[], [], [Americans], [Cabinet, Chief Justice, the Supreme Court], []]   |\n",
+            "|You know, I start tonight by congratulating the 118th Congress and the new Speaker of the House, Kevin McCarthy.                                                                                                                                                                     |[[tonight], [], [], [Congress, House], [Kevin McCarthy]]                 |\n",
+            "|Speaker, I don’t want to ruin your reputation, but I look forward to working with you.                                                                                                                                                                                               |[[], [], [], [], []]                                                     |\n",
+            "|And I want to congratulate the new Leader of the House Democrats, the first African American Minority Leader in history, Hakeem Jeffries.                                                                                                                                            |[[], [], [Democrats, African American], [House, Hakeem Jeffries], []]    |\n",
+            "|He won despite the fact I campaigned for him.                                                                                                                                                                                                                                        |[[], [], [], [], []]                                                     |\n",
+            "|Congratulations to the longest-serving Leader in the history of the United States Senate, Mitch McConnell. Where are you, Mitch?                                                                                                                                                     |[[], [the United States Senate], [], [Leader], [Mitch McConnell, Mitch]] |\n",
+            "|And congratulations to Chuck Schumer, another — you know, another term as Senate Minority [Majority] Leader. You know, I think you — only this time you have a slightly bigger majority, Mr. Leader. And you’re the Majority Leader. About that much bigger? Yeah.                   |[[], [Yeah], [], [Senate, Leader], [Chuck Schumer, Leader]]              |\n",
+            "|Well, I tell you what — I want to give specolec- — special recognition to someone who I think is going to be considered the greatest Speaker in the history of the House of Representatives: Nancy Pelosi.                                                                           |[[], [], [], [the House of Representatives], [Nancy Pelosi]]             |\n",
+            "|Folks, the story of America is a story of progress and resilience, of always moving forward, of never, ever giving up. It’s a story unique among all nations.                                                                                                                        |[[], [America], [], [], []]                                              |\n",
+            "|We’re the only country that has emerged from every crisis we’ve ever entered stronger than we got into it.                                                                                                                                                                           |[[], [], [], [], []]                                                     |\n",
+            "|Look, folks, that’s what we’re doing again.                                                                                                                                                                                                                                          |[[], [], [], [], []]                                                     |\n",
+            "|Two years ago, the economy was reeling. I stand here tonight, after we’ve created, with the help of many people in this room, 12 million new jobs — more jobs created in two years than any President has created in four years — because of you all, because of the American people.|[[Two years ago, tonight, two years, four years], [], [American], [], []]|\n",
+            "|Two years ago — and two years ago, COVID had shut down — our businesses were closed, our schools were robbed of so much. And today, COVID no longer controls our lives.                                                                                                              |[[Two years ago, two years ago, today], [], [], [COVID], []]             |\n",
+            "|And two years ago, our democracy faced its greatest threat since the Civil War. And today, though bruised, our democracy remains unbowed and unbroken.                                                                                                                               |[[two years ago, today], [], [], [], []]                                 |\n",
+            "|As we gather here tonight, we’re writing the next chapter in the great American story — a story of progress and resilience.                                                                                                                                                          |[[tonight], [], [American], [], []]                                      |\n",
+            "|When world leaders ask me to define America — and they do, believe it or not — I say I can define it in one word, and I mean this: possibilities. We don’t think anything is beyond our capacity. Everything is a possibility.                                                       |[[], [America], [], [], []]                                              |\n",
+            "|You know, we’re often told that Democrats and Republicans can’t work together. But over the past two years, we proved the cynics and naysayers wrong.                                                                                                                                |[[the past two years], [], [Democrats, Republicans], [], []]             |\n",
+            "|Yes, we disagreed plenty. And yes, there were times when Democrats went alone.                                                                                                                                                                                                       |[[], [], [Democrats], [], []]                                            |\n",
+            "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "#"
+      ],
+      "metadata": {
+        "id": "-Abubt0jKPRD"
+      }
+    }
+  ]
+}