From 506270d32c3ab052e81d1a1281d950d6d3776559 Mon Sep 17 00:00:00 2001
From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com>
Date: Mon, 28 Nov 2022 11:14:05 +0100
Subject: [PATCH] Add quickstart link and clear notebook outputs

---
 .../Evaluate azure text analytics.ipynb       | 174 ++----------------
 1 file changed, 19 insertions(+), 155 deletions(-)

diff --git a/notebooks/models/Evaluate azure text analytics.ipynb b/notebooks/models/Evaluate azure text analytics.ipynb
index af74d75..f7f122d 100644
--- a/notebooks/models/Evaluate azure text analytics.ipynb	
+++ b/notebooks/models/Evaluate azure text analytics.ipynb	
@@ -4,31 +4,23 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Evaluate Azure Text Analytics for PII detection using the Presidio Evaluator framework\n",
+    "Evaluate PII detection in Azure Cognitive Service for Language using the Presidio Evaluator framework\n",
     "\n",
     "Prerequisites: \n",
-    " - Azure subscription\n",
+    " - Azure subscription - [Create one for free](https://azure.microsoft.com/en-us/free/cognitive-services/)\n",
     " - Once you have your Azure subscription, create a Language resource in the Azure portal to get your key and endpoint. After it deploys, click Go to resource.\n",
     " - You'll need the key and endpoint from the resource you create to connect your application to the API. You'll paste your key and endpoint into the code below later in the quickstart.\n",
     " - You can use the free pricing tier (Free F0) to try the service, and upgrade later to a paid tier for production.\n",
-    " - To use the Analyze feature, you'll need a Language resource with the standard (S) pricing tier."
+    " - To use the Analyze feature, you'll need a Language resource with the standard (S) pricing tier.\n",
+    "\n",
+    "Azure Cognitive Service for Language PII detection quickstart: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/quickstart?pivots=programming-language-python"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "stanza and spacy_stanza are not installed\n",
-      "Flair is not installed by default\n",
-      "Flair is not installed\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from pathlib import Path\n",
     "from copy import deepcopy\n",
@@ -58,45 +50,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "tokenizing input:   0%|          | 0/1500 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "loading model en_core_web_sm\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "tokenizing input: 100%|██████████| 1500/1500 [00:12<00:00, 119.46it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1500\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "dataset_name = \"synth_dataset_v2.json\"\n",
     "dataset = InputSample.read_dataset_json(Path(Path.cwd().parent.parent, \"data\", dataset_name))\n",
@@ -105,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -124,48 +80,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Count per entity:\n",
-      "[('O', 19626),\n",
-      " ('STREET_ADDRESS', 3071),\n",
-      " ('PERSON', 1369),\n",
-      " ('GPE', 521),\n",
-      " ('ORGANIZATION', 504),\n",
-      " ('PHONE_NUMBER', 350),\n",
-      " ('DATE_TIME', 219),\n",
-      " ('TITLE', 142),\n",
-      " ('CREDIT_CARD', 136),\n",
-      " ('US_SSN', 80),\n",
-      " ('AGE', 74),\n",
-      " ('NRP', 55),\n",
-      " ('ZIP_CODE', 50),\n",
-      " ('EMAIL_ADDRESS', 49),\n",
-      " ('DOMAIN_NAME', 37),\n",
-      " ('IP_ADDRESS', 22),\n",
-      " ('IBAN_CODE', 21),\n",
-      " ('US_DRIVER_LICENSE', 9)]\n",
-      "\n",
-      "Example sentence:\n",
-      "Full text: What are my options?\n",
-      "Spans: []\n",
-      "Tokens: What are my options?\n",
-      "Tags: ['O', 'O', 'O', 'O', 'O']\n",
-      "\n",
-      "\n",
-      "Min and max number of tokens in dataset:\n",
-      "Min: 3, Max: 78\n",
-      "\n",
-      "Min and max sentence length in dataset:\n",
-      "Min: 9, Max: 407\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Count per entity:\")\n",
     "pprint(entity_counter.most_common())\n",
@@ -195,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -208,38 +125,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Evaluating Azure Text Analytics.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "saving experiment data to experiment_20221128-094558.json\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Evaluating Azure Text Analytics.\")\n",
     "\n",
@@ -289,20 +177,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Confusion matrix:\n",
-      "         Address  O\n",
-      "Address        6  8\n",
-      "O              0  5\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Confusion matrix:\")\n",
     "print(pd.DataFrame(confmatrix, columns=entities, index=entities))"
@@ -310,22 +187,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Precision and recall\n",
-      "              Entity           Precision              Recall   Number of samples\n",
-      "             Address             100.00%              42.86%                  14\n",
-      "        Organization                nan%               0.00%                   1\n",
-      "                 PII             100.00%              40.00%                  15\n",
-      "PII F measure: 43.61%\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Precision and recall\")\n",
     "print(results)"