Skip to content

Commit

Permalink
Add quickstarted
Browse files Browse the repository at this point in the history
  • Loading branch information
tranguyen221 committed Nov 28, 2022
1 parent 6b50167 commit 506270d
Showing 1 changed file with 19 additions and 155 deletions.
174 changes: 19 additions & 155 deletions notebooks/models/Evaluate azure text analytics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,23 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate Azure Text Analytics for PII detection using the Presidio Evaluator framework\n",
"Evaluate the PII detection feature of Azure Cognitive Service for Language using the Presidio Evaluator framework\n",
"\n",
"Prerequisites: \n",
" - Azure subscription\n",
" - Azure subscription - [Create one for free](https://azure.microsoft.com/en-us/free/cognitive-services/)\n",
" - Once you have your Azure subscription, create a Language resource in the Azure portal to get your key and endpoint. After it deploys, click Go to resource.\n",
" - You'll need the key and endpoint from the resource you create to connect your application to the API. You'll paste your key and endpoint into the code below later in this notebook.\n",
" - You can use the free pricing tier (Free F0) to try the service, and upgrade later to a paid tier for production.\n",
" - To use the Analyze feature, you'll need a Language resource with the standard (S) pricing tier."
" - To use the Analyze feature, you'll need a Language resource with the standard (S) pricing tier.\n",
"\n",
"Azure Cognitive Service for Language PII detection quickstart: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/quickstart?pivots=programming-language-python"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stanza and spacy_stanza are not installed\n",
"Flair is not installed by default\n",
"Flair is not installed\n"
]
}
],
"outputs": [],
"source": [
"from pathlib import Path\n",
"from copy import deepcopy\n",
Expand Down Expand Up @@ -58,45 +50,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 0%| | 0/1500 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model en_core_web_sm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 100%|██████████| 1500/1500 [00:12<00:00, 119.46it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1500\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"dataset_name = \"synth_dataset_v2.json\"\n",
"dataset = InputSample.read_dataset_json(Path(Path.cwd().parent.parent, \"data\", dataset_name))\n",
Expand All @@ -105,7 +61,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -124,48 +80,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Count per entity:\n",
"[('O', 19626),\n",
" ('STREET_ADDRESS', 3071),\n",
" ('PERSON', 1369),\n",
" ('GPE', 521),\n",
" ('ORGANIZATION', 504),\n",
" ('PHONE_NUMBER', 350),\n",
" ('DATE_TIME', 219),\n",
" ('TITLE', 142),\n",
" ('CREDIT_CARD', 136),\n",
" ('US_SSN', 80),\n",
" ('AGE', 74),\n",
" ('NRP', 55),\n",
" ('ZIP_CODE', 50),\n",
" ('EMAIL_ADDRESS', 49),\n",
" ('DOMAIN_NAME', 37),\n",
" ('IP_ADDRESS', 22),\n",
" ('IBAN_CODE', 21),\n",
" ('US_DRIVER_LICENSE', 9)]\n",
"\n",
"Example sentence:\n",
"Full text: What are my options?\n",
"Spans: []\n",
"Tokens: What are my options?\n",
"Tags: ['O', 'O', 'O', 'O', 'O']\n",
"\n",
"\n",
"Min and max number of tokens in dataset:\n",
"Min: 3, Max: 78\n",
"\n",
"Min and max sentence length in dataset:\n",
"Min: 9, Max: 407\n"
]
}
],
"outputs": [],
"source": [
"print(\"Count per entity:\")\n",
"pprint(entity_counter.most_common())\n",
Expand Down Expand Up @@ -195,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -208,38 +125,9 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluating Azure Text Analytics.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1/1 [00:00<00:00, 4.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"saving experiment data to experiment_20221128-094558.json\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"print(\"Evaluating Azure Text Analytics.\")\n",
"\n",
Expand Down Expand Up @@ -289,43 +177,19 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion matrix:\n",
" Address O\n",
"Address 6 8\n",
"O 0 5\n"
]
}
],
"outputs": [],
"source": [
"print(\"Confusion matrix:\")\n",
"print(pd.DataFrame(confmatrix, columns=entities, index=entities))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Precision and recall\n",
" Entity Precision Recall Number of samples\n",
" Address 100.00% 42.86% 14\n",
" Organization nan% 0.00% 1\n",
" PII 100.00% 40.00% 15\n",
"PII F measure: 43.61%\n"
]
}
],
"outputs": [],
"source": [
"print(\"Precision and recall\")\n",
"print(results)"
Expand Down

0 comments on commit 506270d

Please sign in to comment.