Add quickstart link

herberteuler · Nov 28, 2022 · 506270d · 506270d
1 parent 6b50167
commit 506270d
Showing 1 changed file with 19 additions and 155 deletions.
diff --git a/notebooks/models/Evaluate azure text analytics.ipynb b/notebooks/models/Evaluate azure text analytics.ipynb
@@ -4,31 +4,23 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Evaluate Azure Text Analytics for PII detection using the Presidio Evaluator framework\n",
+    "Evaluate Azure Cognitive Service for Language for PII detection using the Presidio Evaluator framework\n",
     "\n",
     "Prerequisites: \n",
-    " - Azure subscription\n",
+    " - Azure subscription - [Create one for free](https://azure.microsoft.com/en-us/free/cognitive-services/)\n",
     " - Once you have your Azure subscription, create a Language resource in the Azure portal to get your key and endpoint. After it deploys, click Go to resource.\n",
     " - You'll need the key and endpoint from the resource you create to connect your application to the API. You'll paste your key and endpoint into the code below later in the quickstart.\n",
     " - You can use the free pricing tier (Free F0) to try the service, and upgrade later to a paid tier for production.\n",
-    " - To use the Analyze feature, you'll need a Language resource with the standard (S) pricing tier."
+    " - To use the Analyze feature, you'll need a Language resource with the standard (S) pricing tier.\n",
+    "\n",
+    "For more details, see the [Azure Cognitive Service for Language PII detection quickstart](https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/quickstart?pivots=programming-language-python)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "stanza and spacy_stanza are not installed\n",
-      "Flair is not installed by default\n",
-      "Flair is not installed\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from pathlib import Path\n",
     "from copy import deepcopy\n",
@@ -58,45 +50,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "tokenizing input:   0%|          | 0/1500 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "loading model en_core_web_sm\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "tokenizing input: 100%|██████████| 1500/1500 [00:12<00:00, 119.46it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1500\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "dataset_name = \"synth_dataset_v2.json\"\n",
     "dataset = InputSample.read_dataset_json(Path(Path.cwd().parent.parent, \"data\", dataset_name))\n",
@@ -105,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -124,48 +80,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Count per entity:\n",
-      "[('O', 19626),\n",
-      " ('STREET_ADDRESS', 3071),\n",
-      " ('PERSON', 1369),\n",
-      " ('GPE', 521),\n",
-      " ('ORGANIZATION', 504),\n",
-      " ('PHONE_NUMBER', 350),\n",
-      " ('DATE_TIME', 219),\n",
-      " ('TITLE', 142),\n",
-      " ('CREDIT_CARD', 136),\n",
-      " ('US_SSN', 80),\n",
-      " ('AGE', 74),\n",
-      " ('NRP', 55),\n",
-      " ('ZIP_CODE', 50),\n",
-      " ('EMAIL_ADDRESS', 49),\n",
-      " ('DOMAIN_NAME', 37),\n",
-      " ('IP_ADDRESS', 22),\n",
-      " ('IBAN_CODE', 21),\n",
-      " ('US_DRIVER_LICENSE', 9)]\n",
-      "\n",
-      "Example sentence:\n",
-      "Full text: What are my options?\n",
-      "Spans: []\n",
-      "Tokens: What are my options?\n",
-      "Tags: ['O', 'O', 'O', 'O', 'O']\n",
-      "\n",
-      "\n",
-      "Min and max number of tokens in dataset:\n",
-      "Min: 3, Max: 78\n",
-      "\n",
-      "Min and max sentence length in dataset:\n",
-      "Min: 9, Max: 407\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Count per entity:\")\n",
     "pprint(entity_counter.most_common())\n",
@@ -195,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -208,38 +125,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Evaluating Azure Text Analytics.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "saving experiment data to experiment_20221128-094558.json\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Evaluating Azure Text Analytics.\")\n",
     "\n",
@@ -289,43 +177,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Confusion matrix:\n",
-      "         Address  O\n",
-      "Address        6  8\n",
-      "O              0  5\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Confusion matrix:\")\n",
     "print(pd.DataFrame(confmatrix, columns=entities, index=entities))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Precision and recall\n",
-      "              Entity           Precision              Recall   Number of samples\n",
-      "             Address             100.00%              42.86%                  14\n",
-      "        Organization                nan%               0.00%                   1\n",
-      "                 PII             100.00%              40.00%                  15\n",
-      "PII F measure: 43.61%\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Precision and recall\")\n",
     "print(results)"