Skip to content

Commit

Permalink
notebooks update
Browse files Browse the repository at this point in the history
  • Loading branch information
omri374 committed Apr 28, 2021
1 parent 3df6e1a commit 3e335a6
Show file tree
Hide file tree
Showing 28 changed files with 395 additions and 334 deletions.
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ steps:
- script: |
python -m pip install --upgrade pip
pip install -r requirements.txt
python m spacy download en_core_web_lg
python -m spacy download en_core_web_lg
displayName: 'Install dependencies'

Expand Down
2 changes: 0 additions & 2 deletions models/__init__.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,35 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true
}
},
"metadata": {},
"outputs": [],
"source": [
"from presidio_evaluator.data_generator import read_synth_dataset\n",
"from presidio_evaluator import ModelEvaluator\n",
"from presidio_evaluator.evaluation import ModelError, Evaluator\n",
"from presidio_evaluator.models import BaseModel, PresidioAnalyzerWrapper\n",
"from collections import Counter\n",
"\n",
"import pandas as pd\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"MY_PRESIDIO_ENDPOINT = \"http://presidio-api.westeurope.cloudapp.azure.com/api/v1/projects/test/analyze\""
"pd.options.display.max_columns = None\n",
"pd.options.display.width=None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate your Presidio instance via the Presidio API"
"# Evaluate Presidio Analyzer\n",
"This notebook runs the PresidioAnalyzerEvaluator class on top of synthetic data.\n",
"\n",
"One can perform the following changes:\n",
"1. Replace the synthetic data creation with real data or with other type of synthetic data\n",
"2. Adapt the Presidio `AnalyzerEngine` to a specific engine with a different set of recognizers or configured to be used on different languages\n",
"\n",
"\n"
]
},
{
Expand All @@ -37,15 +44,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true
}
},
"metadata": {},
"outputs": [],
"source": [
"input_samples = read_synth_dataset(\"../data/synth_dataset.txt\")\n",
"print(\"Read {} samples\".format(len(input_samples)))"
"print(\"Read {} samples\".format(len(input_samples)))\n",
"input_samples[0]"
]
},
{
Expand All @@ -58,11 +62,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true
}
},
"metadata": {},
"outputs": [],
"source": [
"flatten = lambda l: [item for sublist in l for item in sublist]\n",
Expand All @@ -84,32 +84,29 @@
"metadata": {},
"outputs": [],
"source": [
"# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity\n",
"entities_mapping = {\n",
" 'PERSON': 'PERSON',\n",
" 'EMAIL': 'EMAIL_ADDRESS',\n",
" 'CREDIT_CARD': 'CREDIT_CARD',\n",
" 'FIRST_NAME': 'PERSON',\n",
" 'PHONE_NUMBER': 'PHONE_NUMBER',\n",
" 'LOCATION':'LOCATION',\n",
" # 'BIRTHDAY': 'DATE_TIME',\n",
" # 'DATE': 'DATE_TIME',\n",
" 'DOMAIN': 'DOMAIN',\n",
" # 'CITY': 'LOCATION',\n",
" # 'ADDRESS': 'LOCATION',\n",
" 'IBAN': 'IBAN_CODE',\n",
" # 'URL': 'DOMAIN_NAME',\n",
" 'US_SSN': 'US_SSN',\n",
" 'IP_ADDRESS': 'IP_ADDRESS',\n",
" # 'ORGANIZATION':'ORG'\n",
" 'O': 'O'\n",
"presidio_entities_map = {\n",
" \"PERSON\": \"PERSON\",\n",
" \"EMAIL_ADDRESS\": \"EMAIL_ADDRESS\",\n",
" \"CREDIT_CARD\": \"CREDIT_CARD\",\n",
" \"FIRST_NAME\": \"PERSON\",\n",
" \"PHONE_NUMBER\": \"PHONE_NUMBER\",\n",
" \"BIRTHDAY\": \"DATE_TIME\",\n",
" \"DATE_TIME\": \"DATE_TIME\",\n",
" \"DOMAIN\": \"DOMAIN\",\n",
" \"CITY\": \"LOCATION\",\n",
" \"ADDRESS\": \"LOCATION\",\n",
" \"NATIONALITY\": \"LOCATION\",\n",
" \"LOCATION\": \"LOCATION\",\n",
" \"IBAN\": \"IBAN_CODE\",\n",
" \"URL\": \"DOMAIN_NAME\",\n",
" \"US_SSN\": \"US_SSN\",\n",
" \"IP_ADDRESS\": \"IP_ADDRESS\",\n",
" \"ORGANIZATION\": \"ORG\",\n",
" \"TITLE\" : \"O\", # skipping evaluation of titles\n",
" \"O\": \"O\",\n",
"}\n",
"presidio_fields = ['CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'DOMAIN_NAME', 'EMAIL_ADDRESS', 'IBAN_CODE',\n",
" 'IP_ADDRESS', 'NRP', 'LOCATION', 'PERSON', 'PHONE_NUMBER', 'US_SSN']\n",
"\n",
"new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,\n",
" entities_mapping,\n",
" presidio_fields)"
"new_list = Evaluator.align_entity_types(input_samples, presidio_entities_map)"
]
},
{
Expand Down Expand Up @@ -143,9 +140,9 @@
"metadata": {},
"outputs": [],
"source": [
"from presidio_evaluator import PresidioAPIEvaluator\n",
"presidio = PresidioAPIEvaluator(entities_to_keep=list(count_per_entity_new.keys()),endpoint=MY_PRESIDIO_ENDPOINT)\n",
"evaluted_samples = presidio.evaluate_all(new_list[:100])"
"presidio = PresidioAnalyzerWrapper(entities_to_keep=list(count_per_entity_new.keys()))\n",
"evaluator = Evaluator(model=presidio)\n",
"evaluted_samples = evaluator.evaluate_all(new_list[:100])"
]
},
{
Expand All @@ -163,7 +160,7 @@
"metadata": {},
"outputs": [],
"source": [
"evaluation_result = presidio.calculate_score(evaluted_samples)"
"evaluation_result = evaluator.calculate_score(evaluted_samples)"
]
},
{
Expand Down Expand Up @@ -197,7 +194,7 @@
"metadata": {},
"outputs": [],
"source": [
"ModelEvaluator.most_common_fp_tokens(errors,n=5)"
"ModelError.most_common_fp_tokens(errors,n=5)"
]
},
{
Expand All @@ -210,8 +207,9 @@
},
"outputs": [],
"source": [
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PERSON')\n",
"fps_df[['full_text','token','prediction']]"
"fps_df = ModelError.get_fps_dataframe(errors,entity='PERSON')\n",
"if fps_df is not None:\n",
" fps_df[['full_text','token','prediction']]"
]
},
{
Expand All @@ -220,16 +218,30 @@
"metadata": {},
"outputs": [],
"source": [
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PERSON')\n",
"fns_df = ModelError.get_fns_dataframe(errors,entity='PERSON')\n",
"fns_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"name": "pycharm-c8930cf3",
"display_name": "presidio-research",
"language": "python",
"display_name": "PyCharm (presidio-research)"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -241,16 +253,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.8.8"
}
},
"nbformat": 4,
Expand Down
17 changes: 4 additions & 13 deletions notebooks/Generate data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "presidio-research",
"language": "python",
"name": "python3"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -218,18 +218,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Loading

0 comments on commit 3e335a6

Please sign in to comment.