diff --git a/README.md b/README.md
index 295447f..15d7fb4 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans
Once data is generated, it could be split into train/test/validation sets
while ensuring that each template only exists in one set.
-See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb).
+See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb).
## 2. Data representation
diff --git a/VERSION b/VERSION
index d917d3e..b1e80bb 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.1.2
+0.1.3
diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
index 98b5686..7272802 100644
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@@ -2,8 +2,10 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
+ "execution_count": 1,
+ "metadata": {
+ "is_executing": true
+ },
"outputs": [],
"source": [
"# install presidio via pip if not yet installed\n",
@@ -14,8 +16,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
+ "is_executing": true,
"scrolled": true
},
"outputs": [],
@@ -81,9 +84,34 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 3,
+ "metadata": {
+ "is_executing": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "My name is Joshua Jackson\n",
+ "[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
"source": [
"sentence_templates = [\n",
" \"My name is {{name}}\",\n",
@@ -126,8 +154,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
+ "is_executing": true,
"scrolled": true
},
"outputs": [],
@@ -165,13 +194,228 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " number | \n",
+ " gender | \n",
+ " nationality | \n",
+ " prefix | \n",
+ " first_name | \n",
+ " middle_initial | \n",
+ " last_name | \n",
+ " street_name | \n",
+ " city | \n",
+ " state_abbr | \n",
+ " ... | \n",
+ " company | \n",
+ " domain_name | \n",
+ " person | \n",
+ " name | \n",
+ " first_name_female | \n",
+ " first_name_male | \n",
+ " prefix_female | \n",
+ " prefix_male | \n",
+ " last_name_female | \n",
+ " last_name_male | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " female | \n",
+ " Czech | \n",
+ " Mrs. | \n",
+ " Marie | \n",
+ " J | \n",
+ " Hamanová | \n",
+ " P.O. Box 255 | \n",
+ " Kangerlussuaq | \n",
+ " QE | \n",
+ " ... | \n",
+ " Simple Solutions | \n",
+ " MarathonDancing.gl | \n",
+ " Marie J Hamanová | \n",
+ " Marie J Hamanová | \n",
+ " Marie | \n",
+ " | \n",
+ " Mrs. | \n",
+ " | \n",
+ " Hamanová | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " female | \n",
+ " French | \n",
+ " Ms. | \n",
+ " Patricia | \n",
+ " G | \n",
+ " Desrosiers | \n",
+ " Avenida Noruega 42 | \n",
+ " Vila Real | \n",
+ " VR | \n",
+ " ... | \n",
+ " Formula Gray | \n",
+ " LostMillions.com.pt | \n",
+ " Patricia Desrosiers | \n",
+ " Patricia Desrosiers | \n",
+ " Patricia | \n",
+ " | \n",
+ " Ms. | \n",
+ " | \n",
+ " Desrosiers | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " female | \n",
+ " American | \n",
+ " Ms. | \n",
+ " Debra | \n",
+ " O | \n",
+ " Neal | \n",
+ " 1659 Hoog St | \n",
+ " Brakpan | \n",
+ " GA | \n",
+ " ... | \n",
+ " Dahlkemper's | \n",
+ " MediumTube.co.za | \n",
+ " Debra O Neal | \n",
+ " Debra O Neal | \n",
+ " Debra | \n",
+ " | \n",
+ " Ms. | \n",
+ " | \n",
+ " Neal | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " male | \n",
+ " French | \n",
+ " Mr. | \n",
+ " Peverell | \n",
+ " C | \n",
+ " Racine | \n",
+ " 183 Epimenidou Street | \n",
+ " Limassol | \n",
+ " LI | \n",
+ " ... | \n",
+ " Quickbiz | \n",
+ " ImproveLook.com.cy | \n",
+ " Peverell Racine | \n",
+ " Peverell Racine | \n",
+ " | \n",
+ " Peverell | \n",
+ " | \n",
+ " Mr. | \n",
+ " | \n",
+ " Racine | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " female | \n",
+ " Slovenian | \n",
+ " Mrs. | \n",
+ " Iolanda | \n",
+ " S | \n",
+ " Tratnik | \n",
+ " Karu põik 61 | \n",
+ " Pärnu | \n",
+ " PR | \n",
+ " ... | \n",
+ " Dubrow's Cafeteria | \n",
+ " PostTan.com.ee | \n",
+ " Iolanda Tratnik | \n",
+ " Iolanda Tratnik | \n",
+ " Iolanda | \n",
+ " | \n",
+ " Mrs. | \n",
+ " | \n",
+ " Tratnik | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " number gender nationality prefix first_name middle_initial last_name \\\n",
+ "0 1 female Czech Mrs. Marie J Hamanová \n",
+ "1 2 female French Ms. Patricia G Desrosiers \n",
+ "2 3 female American Ms. Debra O Neal \n",
+ "3 4 male French Mr. Peverell C Racine \n",
+ "4 5 female Slovenian Mrs. Iolanda S Tratnik \n",
+ "\n",
+ " street_name city state_abbr ... company \\\n",
+ "0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n",
+ "1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n",
+ "2 1659 Hoog St Brakpan GA ... Dahlkemper's \n",
+ "3 183 Epimenidou Street Limassol LI ... Quickbiz \n",
+ "4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n",
+ "\n",
+ " domain_name person name \\\n",
+ "0 MarathonDancing.gl Marie J Hamanová Marie J Hamanová \n",
+ "1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n",
+ "2 MediumTube.co.za Debra O Neal Debra O Neal \n",
+ "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n",
+ "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n",
+ "\n",
+ " first_name_female first_name_male prefix_female prefix_male \\\n",
+ "0 Marie Mrs. \n",
+ "1 Patricia Ms. \n",
+ "2 Debra Ms. \n",
+ "3 Peverell Mr. \n",
+ "4 Iolanda Mrs. \n",
+ "\n",
+ " last_name_female last_name_male \n",
+ "0 Hamanová \n",
+ "1 Desrosiers \n",
+ "2 Neal \n",
+ "3 Racine \n",
+ "4 Tratnik \n",
+ "\n",
+ "[5 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Read FakeNameGenerator CSV\n",
"fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
@@ -190,8 +434,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
+ "is_executing": true,
"scrolled": true
},
"outputs": [],
@@ -209,8 +454,10 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
+ "execution_count": 7,
+ "metadata": {
+ "is_executing": true
+ },
"outputs": [],
"source": [
"fake.add_provider(IpAddressProvider) # Both Ipv4 and IPv6 IP addresses\n",
@@ -235,8 +482,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@@ -270,13 +518,36 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
"source": [
"sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n",
"fake_records = data_generator.generate_fake_data(\n",
@@ -296,11 +567,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
+ "is_executing": true,
"scrolled": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total: 1500\n",
+ "Avg # of records per template: 7.142857142857143\n",
+ "Median # of records per template: 7.0\n",
+ "Std: 2.5872528966106905\n"
+ ]
+ }
+ ],
"source": [
"count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
"\n",
@@ -323,13 +606,65 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'organization': 257,\n",
+ " 'first_name': 244,\n",
+ " 'person': 238,\n",
+ " 'city': 235,\n",
+ " 'address': 209,\n",
+ " 'street_name': 164,\n",
+ " 'name': 162,\n",
+ " 'country': 154,\n",
+ " 'credit_card_number': 152,\n",
+ " 'phone_number': 121,\n",
+ " 'last_name': 119,\n",
+ " 'building_number': 110,\n",
+ " 'age': 72,\n",
+ " 'secondary_address': 64,\n",
+ " 'year': 58,\n",
+ " 'nationality': 55,\n",
+ " 'postcode': 49,\n",
+ " 'zipcode': 45,\n",
+ " 'url': 39,\n",
+ " 'email': 39,\n",
+ " 'name_female': 37,\n",
+ " 'job': 33,\n",
+ " 'first_name_male': 31,\n",
+ " 'name_male': 29,\n",
+ " 'prefix_male': 28,\n",
+ " 'date_of_birth': 24,\n",
+ " 'iban': 22,\n",
+ " 'date_time': 21,\n",
+ " 'prefix_female': 21,\n",
+ " 'day_of_week': 16,\n",
+ " 'state_abbr': 15,\n",
+ " 'last_name_male': 15,\n",
+ " 'prefix': 12,\n",
+ " 'ip_address': 11,\n",
+ " 'ssn': 11,\n",
+ " 'nation_plural': 9,\n",
+ " 'nation_woman': 8,\n",
+ " 'first_name_nonbinary': 6,\n",
+ " 'us_driver_license': 6,\n",
+ " 'first_name_female': 3,\n",
+ " 'last_name_female': 3})"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"count_per_entity = Counter()\n",
"for record in fake_records:\n",
@@ -351,8 +686,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@@ -421,9 +757,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 13,
+ "metadata": {
+ "is_executing": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"fake_records[0]"
]
@@ -437,13 +786,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('PERSON', 887),\n",
+ " ('STREET_ADDRESS', 596),\n",
+ " ('GPE', 404),\n",
+ " ('ORGANIZATION', 257),\n",
+ " ('CREDIT_CARD', 152),\n",
+ " ('PHONE_NUMBER', 121),\n",
+ " ('DATE_TIME', 119),\n",
+ " ('TITLE', 94),\n",
+ " ('NRP', 72),\n",
+ " ('AGE', 72),\n",
+ " ('ZIP_CODE', 45),\n",
+ " ('DOMAIN_NAME', 39),\n",
+ " ('EMAIL_ADDRESS', 39),\n",
+ " ('IBAN_CODE', 22),\n",
+ " ('IP_ADDRESS', 11),\n",
+ " ('US_SSN', 11),\n",
+ " ('US_DRIVER_LICENSE', 6)]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"\n",
"count_per_entity_new = Counter()\n",
@@ -463,13 +840,51 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/1500 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading model en_core_web_sm\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s\n",
+ "Wall time: 6.96 s\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
"source": [
"%%time\n",
"input_samples = [\n",
@@ -491,8 +906,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@@ -515,21 +931,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]\n"
+ ]
+ }
+ ],
"source": [
"conll = InputSample.create_conll_dataset(input_samples)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
+ "is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@@ -546,7 +972,7 @@
"### Next steps\n",
"\n",
"- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
- "- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
+ "- Split to train/test/validation while ensuring sentences originating from the same template are all on the same subset. [Sample](3_Split_by_pattern_number.ipynb)\n",
"- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
]
},
@@ -569,9 +995,9 @@
"hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
},
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -583,9 +1009,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/2_PII_EDA.ipynb b/notebooks/2_PII_EDA.ipynb
index 8d58a63..1f409fc 100644
--- a/notebooks/2_PII_EDA.ipynb
+++ b/notebooks/2_PII_EDA.ipynb
@@ -72,7 +72,7 @@
"metadata": {},
"outputs": [],
"source": [
- "for (name, series) in pii_df.iteritems():\n",
+ "for (name, series) in pii_df.items():\n",
" print(name)\n",
" print(\"Unique values: {}\".format(len(series.unique())))\n",
" print(series.value_counts())\n",
@@ -123,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
- "series_to_wordcloud(pii_df.country_full)"
+ "series_to_wordcloud(pii_df.country)"
]
},
{
@@ -187,9 +187,9 @@
"metadata": {},
"outputs": [],
"source": [
- "countries = [get_entity_values_from_sample(sample, [\"LOCATION\"]) for sample in synth]\n",
+ "countries = [get_entity_values_from_sample(sample, [\"TITLE\"]) for sample in synth]\n",
"countries = [item for sublist in countries for item in sublist]\n",
- "series_to_wordcloud(pd.Series(countries, name=\"LOCATION\"))"
+ "series_to_wordcloud(pd.Series(countries, name=\"TITLE\"))"
]
},
{
@@ -213,9 +213,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -227,9 +227,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 4
+}
diff --git a/notebooks/3_Split_by_pattern_#.ipynb b/notebooks/3_Split_by_pattern_number.ipynb
similarity index 94%
rename from notebooks/3_Split_by_pattern_#.ipynb
rename to notebooks/3_Split_by_pattern_number.ipynb
index 51b0c1e..17252ec 100644
--- a/notebooks/3_Split_by_pattern_#.ipynb
+++ b/notebooks/3_Split_by_pattern_number.ipynb
@@ -143,13 +143,6 @@
"assert len(train) + len(test) + len(validation) == len(all_samples)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": null,
@@ -160,9 +153,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -174,9 +167,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/4_Evaluate_Presidio_Analyzer.ipynb b/notebooks/4_Evaluate_Presidio_Analyzer.ipynb
index dfa1501..ec26f97 100644
--- a/notebooks/4_Evaluate_Presidio_Analyzer.ipynb
+++ b/notebooks/4_Evaluate_Presidio_Analyzer.ipynb
@@ -5,7 +5,7 @@
"id": "847acd88",
"metadata": {},
"source": [
- "Evaluate Presidio Analyzer using the Presidio Evaluator framework"
+ "# Evaluate Presidio Analyzer using the Presidio Evaluator framework"
]
},
{
@@ -17,7 +17,8 @@
"source": [
"# install presidio via pip if not yet installed\n",
"\n",
- "#!pip install presidio-analyzer\n",
+ "#!pip install presidio-evaluator\n",
+ "#!pip install \"presidio-analyzer[transformers]\"\n",
"#!pip install presidio-evaluator"
]
},
@@ -32,6 +33,10 @@
"from copy import deepcopy\n",
"from pprint import pprint\n",
"from collections import Counter\n",
+ "from typing import List\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
"\n",
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
@@ -45,7 +50,8 @@
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"%reload_ext autoreload\n",
- "%autoreload 2"
+ "%autoreload 2\n",
+ "%matplotlib inline"
]
},
{
@@ -65,6 +71,9 @@
"source": [
"dataset_name = \"synth_dataset_v2.json\"\n",
"dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, \"data\", dataset_name))\n",
+ "\n",
+ "dataset = dataset[:300] # top 300 samples\n",
+ "\n",
"print(len(dataset))"
]
},
@@ -75,10 +84,12 @@
"metadata": {},
"outputs": [],
"source": [
- "entity_counter = Counter()\n",
- "for sample in dataset:\n",
- " for tag in sample.tags:\n",
- " entity_counter[tag] += 1"
+ "def get_entity_counts(dataset:List[InputSample]):\n",
+ " entity_counter = Counter()\n",
+ " for sample in dataset:\n",
+ " for tag in sample.tags:\n",
+ " entity_counter[tag] += 1\n",
+ " return entity_counter\n"
]
},
{
@@ -89,7 +100,7 @@
"outputs": [],
"source": [
"print(\"Count per entity:\")\n",
- "pprint(entity_counter.most_common())\n",
+ "pprint(get_entity_counts(dataset).most_common())\n",
"\n",
"print(\"\\nExample sentence:\")\n",
"print(dataset[1])\n",
@@ -109,52 +120,159 @@
},
{
"cell_type": "markdown",
- "id": "aae4c379",
+ "id": "9c5e16cb-bee8-4f0a-a543-4879daa35b9e",
"metadata": {},
"source": [
- "Run evaluation:"
+ "### Define the AnalyzerEngine object \n",
+ "In this case, using a huggingface model: obi/deid_roberta_i2b2"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "cf65af8f",
+ "id": "313b508f-e901-40b9-b575-c7fb8a794652",
"metadata": {},
"outputs": [],
"source": [
- "print(\"Evaluating Presidio Analyzer\")\n",
+ "from presidio_analyzer import AnalyzerEngine\n",
+ "from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration\n",
+ "\n",
+ "\n",
+ "# Here we define a transformers based NLP engine, \n",
+ "# but you can use this cell to customize your Presidio Analyzer instance\n",
+ "\n",
+ "# Define which model to use\n",
+ "model_config = [{\"lang_code\": \"en\", \"model_name\": {\n",
+ " \"spacy\": \"en_core_web_sm\", # use a small spaCy model for lemmas, tokens etc.\n",
+ " \"transformers\": \"obi/deid_roberta_i2b2\"\n",
+ " }\n",
+ "}]\n",
+ "\n",
+ "# Map transformers model labels to Presidio's\n",
+ "model_to_presidio_entity_mapping = dict(\n",
+ " PER=\"PERSON\",\n",
+ " PERSON=\"PERSON\",\n",
+ " LOC= \"LOCATION\",\n",
+ " LOCATION= \"LOCATION\",\n",
+ " GPE=\"LOCATION\",\n",
+ " ORG=\"ORGANIZATION\",\n",
+ " ORGANIZATION=\"ORGANIZATION\",\n",
+ " NORP=\"NRP\",\n",
+ " AGE=\"AGE\",\n",
+ " ID=\"ID\",\n",
+ " EMAIL=\"EMAIL\",\n",
+ " PATIENT=\"PERSON\",\n",
+ " STAFF=\"PERSON\",\n",
+ " HOSP=\"ORGANIZATION\",\n",
+ " PATORG=\"ORGANIZATION\",\n",
+ " DATE=\"DATE_TIME\",\n",
+ " TIME=\"DATE_TIME\",\n",
+ " PHONE=\"PHONE_NUMBER\",\n",
+ " HCW=\"PERSON\",\n",
+ " HOSPITAL=\"ORGANIZATION\",\n",
+ " FACILITY=\"LOCATION\",\n",
+ ")\n",
+ "\n",
+ "ner_model_configuration = NerModelConfiguration(labels_to_ignore = [\"O\"], \n",
+ " model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)\n",
+ "\n",
+ "nlp_engine = TransformersNlpEngine(models=model_config,\n",
+ " ner_model_configuration=ner_model_configuration)\n",
"\n",
+ "# Set up the engine, loads the NLP module (spaCy model by default) \n",
+ "# and other PII recognizers\n",
+ "analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aae4c379",
+ "metadata": {},
+ "source": [
+ "### Run evaluation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16dbf6d6-a554-4602-8907-589786d47a12",
+ "metadata": {},
+ "source": [
+ "#### Define experiment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "29d39ff1-4f14-4e32-ae84-ecc6c739f829",
+ "metadata": {},
+ "outputs": [],
+ "source": [
"experiment = get_experiment_tracker()\n",
- "model_name = \"Presidio Analyzer\"\n",
- "model = PresidioAnalyzerWrapper()\n",
+ "model = PresidioAnalyzerWrapper(analyzer_engine)\n",
+ "\n",
+ "# Define evaluator and experiment tracking\n",
"\n",
"evaluator = Evaluator(model=model)\n",
"dataset = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
")\n",
"\n",
- "evaluation_results = evaluator.evaluate_all(dataset)\n",
- "results = evaluator.calculate_score(evaluation_results)\n",
+ "print(\"Count per entity after alignment:\")\n",
+ "pprint(get_entity_counts(dataset).most_common())\n",
"\n",
- "# update params tracking\n",
- "params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
+ "# Track model and dataset params\n",
+ "params = {\"dataset_name\": dataset_name, \"model_name\": model.name}\n",
"params.update(model.to_log())\n",
"experiment.log_parameters(params)\n",
- "experiment.log_dataset_hash(dataset)\n",
+ "experiment.log_dataset_hash(dataset)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2a7d6626-d094-4dfd-8f37-c0443edf00dc",
+ "metadata": {},
+ "source": [
+ "#### Run experiment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf65af8f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run experiment\n",
+ "evaluation_results = evaluator.evaluate_all(dataset)\n",
+ "results = evaluator.calculate_score(evaluation_results)\n",
+ "\n",
+ "# Track experiment results\n",
"experiment.log_metrics(results.to_log())\n",
"entities, confmatrix = results.to_confusion_matrix()\n",
- "experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
- "\n",
- "print(\"Confusion matrix:\")\n",
- "print(pd.DataFrame(confmatrix, columns=entities, index=entities))\n",
+ "experiment.log_confusion_matrix(matrix=confmatrix, \n",
+ " labels=entities)\n",
"\n",
- "print(\"Precision and recall\")\n",
- "print(results)\n",
+ "# Plot output\n",
+ "plotter = evaluator.Plotter(model=model, \n",
+ " results=results, \n",
+ " output_folder = \".\", \n",
+ " model_name = model.name, \n",
+ " beta = 2)\n",
"\n",
"# end experiment\n",
"experiment.end()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b4d662d-596c-4a69-b3c9-1edcda20cc5b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plotter.plot_scores()"
+ ]
+ },
{
"cell_type": "markdown",
"id": "070f8287",
@@ -198,7 +316,7 @@
"id": "98f4802e",
"metadata": {},
"source": [
- "1. Most false positive tokens:"
+ "1. Most common false positive tokens:"
]
},
{
@@ -219,7 +337,7 @@
"outputs": [],
"source": [
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"LOCATION\"])\n",
- "fps_df[[\"full_text\", \"token\", \"prediction\"]]"
+ "fps_df[[\"full_text\", \"token\", \"annotation\", \"prediction\"]]"
]
},
{
@@ -227,7 +345,7 @@
"id": "d0852513",
"metadata": {},
"source": [
- "2. False negative examples"
+ "2. Most common false negative examples"
]
},
{
@@ -237,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
- "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
+ "ModelError.most_common_fn_tokens(errors, n=50)"
]
},
{
@@ -255,7 +373,7 @@
"metadata": {},
"outputs": [],
"source": [
- "fns_df = ModelError.get_fns_dataframe(errors, entity=[\"PHONE_NUMBER\"])"
+ "fns_df = ModelError.get_fns_dataframe(errors, entity=[\"IP_ADDRESS\"])"
]
},
{
@@ -278,13 +396,21 @@
"print(\"All errors:\\n\")\n",
"[print(error, \"\\n\") for error in errors]"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a67ff38d-0817-4864-9991-b3eb1f80eecc",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -296,7 +422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/notebooks/5_Pseudonymization_demo.ipynb b/notebooks/5_Pseudonymization_demo.ipynb
index 4e2a14c..794a251 100644
--- a/notebooks/5_Pseudonymization_demo.ipynb
+++ b/notebooks/5_Pseudonymization_demo.ipynb
@@ -88,7 +88,7 @@
{
"data": {
"text/plain": [
- "[type: DOMAIN_NAME, start: 57, end: 69, score: 1.0,\n",
+ "[type: URL, start: 49, end: 69, score: 0.95,\n",
" type: PERSON, start: 14, end: 24, score: 0.85]"
]
},
@@ -116,11 +116,11 @@
{
"data": {
"text/plain": [
- "['Hi my name is Albert Cohen and this is my website: https://http://chapman-downs.info/',\n",
- " 'Hi my name is Lisa Miller and this is my website: https://http://benson.org/',\n",
- " 'Hi my name is Kathleen Hale and this is my website: https://http://www.garcia.com/',\n",
- " 'Hi my name is Michelle Frederick and this is my website: https://https://robinson.com/',\n",
- " 'Hi my name is Alicia Santana and this is my website: https://https://www.ray.org/']"
+ "['Hi my name is Tammy Ryan and this is my website: https://www.cardenas.info/',\n",
+ " 'Hi my name is Jessica Smith and this is my website: http://jones-hunt.info/',\n",
+ " 'Hi my name is Michele Marsh and this is my website: https://guerrero.com/',\n",
+ " 'Hi my name is Kathleen Miller and this is my website: https://lopez.com/',\n",
+ " 'Hi my name is Paul Brown and this is my website: http://www.banks-evans.info/']"
]
},
"execution_count": 6,
@@ -153,11 +153,11 @@
"-------------\n",
"Fake examples:\n",
"\n",
- "Our son R2D2 used to work in Botswana\n",
- "Our son R2D2 used to work in American Samoa\n",
- "Our son R2D2 used to work in Malawi\n",
- "Our son R2D2 used to work in Montenegro\n",
- "our son r2d2 used to work in lebanon\n"
+ "Our son R2D2 used to work in Nigeria\n",
+ "Our son R2D2 used to work in Guam\n",
+ "Our son R2D2 used to work in Reunion\n",
+ "Our son R2D2 used to work in Vanuatu\n",
+ "Our son R2D2 used to work in Malaysia\n"
]
}
],
@@ -176,13 +176,20 @@
"print(f\"-------------\\nFake examples:\\n\")\n",
"print(*fake_samples, sep=\"\\n\")"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -194,9 +201,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
}
diff --git a/notebooks/models/Create datasets for Spacy training.ipynb b/notebooks/models/Create datasets for Spacy training.ipynb
index 37c43c1..56e8744 100644
--- a/notebooks/models/Create datasets for Spacy training.ipynb
+++ b/notebooks/models/Create datasets for Spacy training.ipynb
@@ -23,7 +23,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -42,55 +42,18 @@
},
"outputs": [],
"source": [
- "DATA_DATE = \"Dec-19-2021\""
+ "DATA_DATE = \"Dec-27-2023\" # Change to the date when notebook 3 (split to train/test) was run"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "tokenizing input: 0%| | 0/2122 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading model en_core_web_sm\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "tokenizing input: 100%|███████████████████████████████████████████████████████████| 2122/2122 [00:19<00:00, 109.66it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Read 2122 samples\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"data_path = \"../../data/{}_{}.json\"\n",
"\n",
@@ -111,17 +74,9 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Kept 1940 samples after removal of non-tagged samples\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]\n",
"print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))"
@@ -140,45 +95,13 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Entities found in training set:\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "{'ADDRESS',\n",
- " 'CREDIT_CARD',\n",
- " 'DATE_TIME',\n",
- " 'DOMAIN_NAME',\n",
- " 'EMAIL_ADDRESS',\n",
- " 'IBAN_CODE',\n",
- " 'IP_ADDRESS',\n",
- " 'LOCATION',\n",
- " 'O',\n",
- " 'ORGANIZATION',\n",
- " 'PERSON',\n",
- " 'PHONE_NUMBER',\n",
- " 'PREFIX',\n",
- " 'TITLE',\n",
- " 'US_SSN'}"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"print(\"Entities found in training set:\")\n",
"entities = []\n",
@@ -206,16 +129,7 @@
"name": "#%%\n"
}
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n",
- "Skipping illegal span None, text=U.N\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"spacy_train = InputSample.create_spacy_dataset(\n",
" dataset=train_tagged, output_path=\"train.spacy\"\n",
@@ -281,9 +195,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -295,9 +209,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 4
+}
diff --git a/notebooks/models/Evaluate CRF models.ipynb b/notebooks/models/Evaluate CRF models.ipynb
index fecf820..6e10f36 100644
--- a/notebooks/models/Evaluate CRF models.ipynb
+++ b/notebooks/models/Evaluate CRF models.ipynb
@@ -39,6 +39,16 @@
"%autoreload 2"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aee00770-a972-4a19-b423-1724214cc88c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install sklearn_crfsuite"
+ ]
+ },
{
"cell_type": "markdown",
"id": "a0d2d772",
@@ -58,8 +68,9 @@
},
"outputs": [],
"source": [
- "DATA_DATE = \"Jan-15-2022\"\n",
- "dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n",
+ "DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was run\n",
+ "dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n",
+ "dataset = InputSample.read_dataset_json(dataset_name)\n",
"print(len(dataset))"
]
},
@@ -76,7 +87,7 @@
"source": [
"entity_counter = Counter()\n",
"for sample in dataset:\n",
- " for t>ag in sample.tags:\n",
+ " for tag in sample.tags:\n",
" entity_counter[tag] += 1"
]
},
@@ -257,7 +268,7 @@
"metadata": {},
"outputs": [],
"source": [
- "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n",
+ "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n",
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
]
},
@@ -276,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
- "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
+ "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])"
]
},
{
@@ -325,13 +336,21 @@
"metadata": {},
"outputs": [],
"source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -343,9 +362,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/notebooks/models/Evaluate azure text analytics.ipynb b/notebooks/models/Evaluate azure text analytics.ipynb
index f7f122d..5e11369 100644
--- a/notebooks/models/Evaluate azure text analytics.ipynb
+++ b/notebooks/models/Evaluate azure text analytics.ipynb
@@ -205,7 +205,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3.9.13 ('presidio')",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -219,9 +219,8 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.9.18"
},
- "orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
@@ -229,5 +228,5 @@
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/models/Evaluate flair models.ipynb b/notebooks/models/Evaluate flair models.ipynb
index 949906f..22b6d39 100644
--- a/notebooks/models/Evaluate flair models.ipynb
+++ b/notebooks/models/Evaluate flair models.ipynb
@@ -35,6 +35,16 @@
"%autoreload 2"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0c3285c-06a2-4361-aec2-8375496f75b3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install flair"
+ ]
+ },
{
"cell_type": "markdown",
"id": "f036de59",
@@ -111,15 +121,14 @@
"metadata": {},
"outputs": [],
"source": [
- "flair_ner = \"ner-english\"\n",
- "flair_ner_fast = \"ner-english-fast\"\n",
- "flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n",
- "flair_ontonotes_large = \"ner-english-ontonotes-large\"\n",
+ "flair_ner = \"flair/ner-english\"\n",
+ "flair_ner_fast = \"flair/ner-english-fast\"\n",
+ "flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n",
+ "flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n",
"models = [\n",
" flair_ner,\n",
" flair_ner_fast,\n",
" flair_ontonotes_fast,\n",
- " flair_ner_fast,\n",
" flair_ontonotes_large,\n",
"]"
]
@@ -312,9 +321,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -326,7 +335,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/notebooks/models/Evaluate spacy models.ipynb b/notebooks/models/Evaluate spacy models.ipynb
index 1f69f59..d9a7047 100644
--- a/notebooks/models/Evaluate spacy models.ipynb
+++ b/notebooks/models/Evaluate spacy models.ipynb
@@ -109,7 +109,10 @@
"metadata": {},
"outputs": [],
"source": [
- "models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]"
+ "models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n",
+ "\n",
+ "# If needed, install models using `python -m spacy download X` where X is the model name, or use spacy.cli.download:\n",
+ "#spacy.cli.download(\"en_core_web_trf\")"
]
},
{
@@ -334,9 +337,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "presidio",
+ "display_name": "presidio-evaluator",
"language": "python",
- "name": "presidio"
+ "name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@@ -348,9 +351,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.12"
+ "version": "3.9.18"
}
},
"nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 4
+}
diff --git a/presidio_evaluator/data_generator/presidio_data_generator.py b/presidio_evaluator/data_generator/presidio_data_generator.py
index 80633db..fe1a9c3 100644
--- a/presidio_evaluator/data_generator/presidio_data_generator.py
+++ b/presidio_evaluator/data_generator/presidio_data_generator.py
@@ -170,6 +170,8 @@ def add_provider_alias(self, provider_name: str, new_name: str) -> None:
new_provider = BaseProvider(self.faker)
setattr(new_provider, new_name, original)
+ setattr(new_provider, new_name.lower(), original) # avoid case sensitivity
+ setattr(new_provider, new_name.upper(), original) # avoid case sensitivity
self.faker.add_provider(new_provider)
@staticmethod
diff --git a/presidio_evaluator/data_generator/presidio_pseudonymize.py b/presidio_evaluator/data_generator/presidio_pseudonymize.py
index 9e859cf..334bce7 100644
--- a/presidio_evaluator/data_generator/presidio_pseudonymize.py
+++ b/presidio_evaluator/data_generator/presidio_pseudonymize.py
@@ -25,7 +25,7 @@ def __init__(self, map_to_presidio_entities: bool = True, **kwargs):
self.add_provider_alias("credit_card_number", "CREDIT_CARD")
self.add_provider_alias("iban", "IBAN_CODE")
self.add_provider_alias("phone_number", "PHONE_NUMBER")
- self.add_provider_alias("url", "DOMAIN_NAME")
+ self.add_provider_alias("url", "URL")
self.add_provider_alias("ssn", "US_SSN")
self.add_provider_alias("email", "EMAIL_ADDRESS")
self.add_provider_alias("date_time", "DATE_TIME")
diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py
index 6532657..0799198 100644
--- a/presidio_evaluator/evaluation/evaluator.py
+++ b/presidio_evaluator/evaluation/evaluator.py
@@ -1,6 +1,8 @@
+import copy
from collections import Counter
from typing import List, Optional, Dict
from pathlib import Path
+import string
import numpy as np
from tqdm import tqdm
@@ -39,7 +41,6 @@ def __init__(
self.entities_to_keep = self.model.entities
def compare(self, input_sample: InputSample, prediction: List[str]):
-
"""
Compares ground truth tags (annotation) and predicted (prediction)
:param input_sample: input sample containing list of tags with scheme
@@ -71,6 +72,9 @@ def compare(self, input_sample: InputSample, prediction: List[str]):
if self.entities_to_keep:
prediction = self._adjust_per_entities(prediction)
new_annotation = self._adjust_per_entities(new_annotation)
+
+ skip_words = self.get_skip_words()
+
for i in range(0, len(new_annotation)):
results[(new_annotation[i], prediction[i])] += 1
@@ -81,6 +85,10 @@ def compare(self, input_sample: InputSample, prediction: List[str]):
# check if there was an error
is_error = new_annotation[i] != prediction[i]
+ if str(tokens[i]).lower().strip() in skip_words:
+ is_error = False
+ results[(new_annotation[i], prediction[i])] -= 1
+
if is_error:
if prediction[i] == "O":
mistakes.append(
@@ -151,7 +159,6 @@ def evaluate_all(self, dataset: List[InputSample]) -> List[EvaluationResult]:
f"Mapping entity values using this dictionary: {self.model.entity_mapping}"
)
for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"):
-
# Align tag values to the ones expected by the model
self.model.align_entity_types(sample)
@@ -345,13 +352,13 @@ def f_beta(precision: float, recall: float, beta: float) -> float:
if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0):
return np.nan
- return ((1 + beta ** 2) * precision * recall) / (
- ((beta ** 2) * precision) + recall
+ return ((1 + beta**2) * precision * recall) / (
+ ((beta**2) * precision) + recall
)
class Plotter:
"""
- Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives)
+ Plot scores (f2, precision, recall) and errors (false-positives, false-negatives)
for a PII detection model evaluated via Evaluator
:param model: Instance of a fitted model (of base type BaseModel)
@@ -362,7 +369,9 @@ class Plotter:
which gives more or less weight to precision vs. recall
"""
- def __init__(self, model, results, output_folder: Path, model_name: str, beta: float):
+ def __init__(
+ self, model, results, output_folder: Path, model_name: str, beta: float
+ ):
self.model = model
self.results = results
self.output_folder = output_folder
@@ -372,41 +381,66 @@ def __init__(self, model, results, output_folder: Path, model_name: str, beta: f
def plot_scores(self) -> None:
"""
- Plots per-entity recall, precision, or F2 score for evaluated model.
- :param plot_type: which metric to graph (default is F2 score)
+ Plots per-entity recall, precision, or F2 score for evaluated model.
"""
scores = {}
- scores['entity'] = list(self.results.entity_recall_dict.keys())
- scores['recall'] = list(self.results.entity_recall_dict.values())
- scores['precision'] = list(self.results.entity_precision_dict.values())
- scores['count'] = list(self.results.n_dict.values())
- scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
- for recall, precision in zip(scores['recall'], scores['precision'])]
+
+ entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict)
+ entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict)
+
+ scores["entity"] = list(entity_recall_dict.keys())
+ scores["recall"] = list(entity_recall_dict.values())
+ scores["precision"] = list(entity_precision_dict.values())
+ scores["count"] = list(self.results.n_dict.values())
+
+ scores[f"f{self.beta}_score"] = [
+ Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
+ for recall, precision in zip(scores["recall"], scores["precision"])
+ ]
+
+ # Add PII detection rates
+ scores["entity"].append("PII")
+ scores["recall"].append(self.results.pii_recall)
+ scores["precision"].append(self.results.pii_precision)
+ scores["count"].append(self.results.n)
+ scores[f"f{self.beta}_score"].append(self.results.pii_f)
+
df = pd.DataFrame(scores)
- df['model'] = self.model_name
+ df["model"] = self.model_name
self._plot(df, plot_type="f2_score")
self._plot(df, plot_type="precision")
self._plot(df, plot_type="recall")
def _plot(self, df, plot_type) -> None:
- fig = px.bar(df, text_auto=".2", y='entity', orientation="h",
- x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}")
- fig.update_layout(barmode='group', yaxis={
- 'categoryorder': 'total ascending'})
+ fig = px.bar(
+ df,
+ text_auto=".2",
+ y="entity",
+ orientation="h",
+ x=plot_type,
+ color="count",
+ barmode="group",
+ height=30*len(set(df["entity"])),
+ title=f"Per-entity {plot_type} for {self.model_name}",
+ )
+ fig.update_layout(
+ barmode="group", yaxis={"categoryorder": "total ascending"}
+ )
fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity")
- fig.update_traces(textfont_size=12, textangle=0,
- textposition="outside", cliponaxis=False)
+ fig.update_traces(
+ textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
+ )
fig.update_layout(
plot_bgcolor="#FFF",
xaxis=dict(
title="PII entity",
linecolor="#BCCCDC", # Sets color of X-axis line
- showgrid=False # Removes X-axis grid lines
+ showgrid=False, # Removes X-axis grid lines
),
yaxis=dict(
title=f"{plot_type}",
linecolor="#BCCCDC", # Sets color of X-axis line
- showgrid=False # Removes X-axis grid lines
+ showgrid=False, # Removes X-axis grid lines
),
)
fig.show()
@@ -419,47 +453,100 @@ def plot_most_common_tokens(self) -> None:
for entity in self.model.entity_mapping.values():
fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity])
if fps_df is not None:
- fps_path = self.output_folder / \
- f"{self.model_name}-{entity}-fps.csv"
+ fps_path = (
+ self.output_folder / f"{self.model_name}-{entity}-fps.csv"
+ )
fps_df.to_csv(fps_path)
fps_frames.append(fps_path)
fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity])
if fns_df is not None:
- fns_path = self.output_folder / \
- f"{self.model_name}-{entity}-fns.csv"
+ fns_path = (
+ self.output_folder / f"{self.model_name}-{entity}-fns.csv"
+ )
fns_df.to_csv(fns_path)
fns_frames.append(fns_path)
def group_tokens(df):
- return df.groupby(['token', 'annotation']).size().to_frame(
- ).sort_values([0], ascending=False).head(3).reset_index()
+ return (
+ df.groupby(["token", "annotation"])
+ .size()
+ .to_frame()
+ .sort_values([0], ascending=False)
+ .head(3)
+ .reset_index()
+ )
fps_tokens_df = pd.concat(
- [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames])
+ [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]
+ )
fns_tokens_df = pd.concat(
- [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames])
+ [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]
+ )
def generate_graph(title, tokens_df):
- fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation',
- title=f"Most common {title} for {self.model_name}")
+ fig = px.histogram(
+ tokens_df,
+ x=0,
+ y="token",
+ orientation="h",
+ color="annotation",
+ title=f"Most common {title} for {self.model_name}",
+ )
fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity")
- fig.update_traces(textfont_size=12, textangle=0,
- textposition="outside", cliponaxis=False)
+ fig.update_traces(
+ textfont_size=12,
+ textangle=0,
+ textposition="outside",
+ cliponaxis=False,
+ )
fig.update_layout(
plot_bgcolor="#FFF",
xaxis=dict(
title="Count",
linecolor="#BCCCDC", # Sets color of X-axis line
- showgrid=False # Removes X-axis grid lines
+ showgrid=False, # Removes X-axis grid lines
),
yaxis=dict(
title=f"Tokens",
linecolor="#BCCCDC", # Sets color of X-axis line
- showgrid=False # Removes X-axis grid lines
+ showgrid=False, # Removes X-axis grid lines
),
)
- fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+ fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.show()
+
generate_graph(title="false-negatives", tokens_df=fns_tokens_df)
generate_graph(title="false-positives", tokens_df=fps_tokens_df)
+
+ @staticmethod
+ def get_skip_words():
+ skip_words = [x for x in string.punctuation]
+ skip_words.extend(
+ [
+ "\n",
+ "\n\n",
+ "\n\n\n",
+ ">>",
+ ">>>",
+ ">>>>",
+ "street",
+ "st.",
+ "st",
+ "de",
+ "rue",
+ "via",
+ "and",
+ "or",
+ "do",
+ "as",
+ "of",
+ "day",
+ "address",
+ "country",
+ "state",
+ "city",
+ ]
+ )
+
+ return skip_words
diff --git a/presidio_evaluator/models/base_model.py b/presidio_evaluator/models/base_model.py
index bd07658..6f27d96 100644
--- a/presidio_evaluator/models/base_model.py
+++ b/presidio_evaluator/models/base_model.py
@@ -31,6 +31,7 @@ def __init__(
self.labeling_scheme = labeling_scheme
self.entity_mapping = entity_mapping
self.verbose = verbose
+ self.name = self.__class__.__name__
@abstractmethod
def predict(self, sample: InputSample, **kwargs) -> List[str]:
diff --git a/presidio_evaluator/models/crf_model.py b/presidio_evaluator/models/crf_model.py
index 5a25011..5a4462c 100644
--- a/presidio_evaluator/models/crf_model.py
+++ b/presidio_evaluator/models/crf_model.py
@@ -85,7 +85,7 @@ def _to_feature_set(self, dataset: List[InputSample]):
y_train = [self.sent2labels(s) for s in sentences]
return X_train, y_train
- def predict(self, sample: InputSample) -> List[str]:
+ def predict(self, sample: InputSample, **kwargs) -> List[str]:
tags = CRFModel.crf_predict(sample, self.model)
if len(tags) != len(sample.tokens):
diff --git a/presidio_evaluator/models/flair_model.py b/presidio_evaluator/models/flair_model.py
index bf25b4e..da382a3 100644
--- a/presidio_evaluator/models/flair_model.py
+++ b/presidio_evaluator/models/flair_model.py
@@ -48,7 +48,7 @@ def __init__(
self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm"))
- def predict(self, sample: InputSample) -> List[str]:
+ def predict(self, sample: InputSample, **kwargs) -> List[str]:
sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
self.model.predict(sentence)
diff --git a/presidio_evaluator/models/presidio_analyzer_wrapper.py b/presidio_evaluator/models/presidio_analyzer_wrapper.py
index 9598b2b..6372ad2 100644
--- a/presidio_evaluator/models/presidio_analyzer_wrapper.py
+++ b/presidio_evaluator/models/presidio_analyzer_wrapper.py
@@ -91,23 +91,28 @@ def predict(self, sample: InputSample, **kwargs) -> List[str]:
"PHONE_NUMBER": "PHONE_NUMBER",
"BIRTHDAY": "DATE_TIME",
"DATE_TIME": "DATE_TIME",
- "DOMAIN_NAME": "DOMAIN_NAME",
+ "DOMAIN_NAME": "URL",
+ "TIME" : "DATE_TIME",
+ "DATE" : "DATE_TIME",
"CITY": "LOCATION",
"ADDRESS": "LOCATION",
+ "STREET_ADDRESS": "LOCATION",
"NATIONALITY": "LOCATION",
"LOCATION": "LOCATION",
"IBAN_CODE": "IBAN_CODE",
- "URL": "DOMAIN_NAME",
+ "URL": "URL",
"US_SSN": "US_SSN",
"IP_ADDRESS": "IP_ADDRESS",
- "ORGANIZATION": "ORG",
+ "ORGANIZATION": "ORGANIZATION",
+ "ORG": "ORGANIZATION",
"US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
- "NRP": "NRP",
- "TITLE": "O", # not supported
- "PREFIX": "O", # not supported
- "STREET_ADDRESS": "O", # not supported
- "ZIP_CODE": "O", # not supported
- "AGE": "O", # not supported
+ "NRP": "LOCATION",
+ "NORP": "LOCATION",
+ "ID": "ID",
+ "TITLE": "O", # not supported through spaCy
+ "PREFIX": "O", # not supported through spaCy
+ "ZIP_CODE": "O", # not supported through spaCy
+ "AGE": "O", # not supported through spaCy
"O": "O",
}
diff --git a/presidio_evaluator/models/spacy_model.py b/presidio_evaluator/models/spacy_model.py
index e919ccf..0ed30ea 100644
--- a/presidio_evaluator/models/spacy_model.py
+++ b/presidio_evaluator/models/spacy_model.py
@@ -31,7 +31,7 @@ def __init__(
else:
self.model = model
- def predict(self, sample: InputSample) -> List[str]:
+ def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Predict a list of tags for an inpuit sample.
:param sample: InputSample
diff --git a/presidio_evaluator/models/stanza_model.py b/presidio_evaluator/models/stanza_model.py
index 9dd6a01..2d0d1cb 100644
--- a/presidio_evaluator/models/stanza_model.py
+++ b/presidio_evaluator/models/stanza_model.py
@@ -51,7 +51,7 @@ def __init__(
entity_mapping=entity_mapping,
)
- def predict(self, sample: InputSample) -> List[str]:
+ def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Predict the tags using a stanza model.
diff --git a/presidio_evaluator/models/text_analytics_wrapper.py b/presidio_evaluator/models/text_analytics_wrapper.py
index b353c13..42fd308 100644
--- a/presidio_evaluator/models/text_analytics_wrapper.py
+++ b/presidio_evaluator/models/text_analytics_wrapper.py
@@ -48,8 +48,7 @@ def __authenticate_client(self, key: str, endpoint: str):
)
return text_analytics_client
-
- def predict(self, sample: InputSample) -> List[str]:
+ def predict(self, sample: InputSample, **kwargs) -> List[str]:
documents = [sample.full_text]
response = self.ta_client.recognize_pii_entities(documents,
language="en")
diff --git a/pyproject.toml b/pyproject.toml
index 94beda4..a0ec05e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,26 +4,23 @@ version = "0.1.0"
description = ""
authors = ["Omri Mendels "]
readme = "README.md"
+include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}]
[tool.poetry.dependencies]
python = "^3.9"
-spacy = ">=3.2.0, <4.0.0"
-numpy = ">=1.20.2,<2.0.0"
-jupyter = ">=1"
-pandas = ">=1.2.4,<2.0.0"
-tqdm = ">=4.60.0,<5.0.0"
-haikunator = ">=2.1.0,<3.0.0"
-schwifty = ">=2023.11.2,<2024.0.0"
-faker = ">=9.6.0,<10.0.0"
-scikit-learn = ">1.3.2,<2.0.0"
-pytest = ">=6.2.3"
+spacy = "^3.5.0"
+numpy = "^1.22"
+pandas = "^2.1.4"
+tqdm = "^4.60.0"
+faker = "^21.0"
+scikit-learn = "^1.3.2"
presidio-analyzer = "^2.2.351"
presidio-anonymizer = "^2.2.351"
-requests = ">=2.25.1"
-xmltodict = ">=0.12.0"
+requests = "^2.25"
+xmltodict = "^0.12.0"
python-dotenv = "^1.0.0"
plotly = "^5.18.0"
-azure-ai-textanalytics = ">=5.3.0"
+azure-ai-textanalytics = "^5.3.0"
en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"}
en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"}
diff --git a/setup.py b/setup.py
index e7005b0..4566045 100644
--- a/setup.py
+++ b/setup.py
@@ -1,54 +1,57 @@
-from setuptools import setup, find_packages
-import os.path
-
-# read the contents of the README file
+# -*- coding: utf-8 -*-
+from setuptools import setup
+import os
from os import path
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
long_description = f.read()
- # print(long_description)
with open(os.path.join(this_directory, "VERSION")) as version_file:
- __version__ = version_file.read().strip()
+ version = version_file.read().strip()
+
+
+packages = [
+ "presidio_evaluator",
+ "presidio_evaluator.data_generator",
+ "presidio_evaluator.data_generator.faker_extensions",
+ "presidio_evaluator.dataset_formatters",
+ "presidio_evaluator.evaluation",
+ "presidio_evaluator.experiment_tracking",
+ "presidio_evaluator.models",
+]
+
+package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]}
+
+install_requires = [
+ "azure-ai-textanalytics>=5.3.0,<6.0.0",
+ "en_core_web_lg @ "
+ "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz",
+ "en_core_web_sm @ "
+ "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz",
+ "faker>=21.0,<22.0",
+ "numpy>=1.22,<2.0",
+ "pandas>=2.1.4,<3.0.0",
+ "plotly>=5.18.0,<6.0.0",
+ "presidio-analyzer>=2.2.351,<3.0.0",
+ "presidio-anonymizer>=2.2.351,<3.0.0",
+ "python-dotenv>=1.0.0,<2.0.0",
+ "requests>=2.25,<3.0",
+ "scikit-learn>=1.3.2,<2.0.0",
+ "spacy>=3.5.0,<4.0.0",
+ "tqdm>=4.60.0,<5.0.0",
+ "xmltodict>=0.12.0,<0.13.0",
+]
setup(
name="presidio-evaluator",
long_description=long_description,
long_description_content_type="text/markdown",
- version=__version__,
- packages=find_packages(exclude=["tests"]),
url="https://www.github.com/microsoft/presidio-research",
+ version=version,
license="MIT",
- description="PII dataset generator, model evaluator for Presidio and PII data in general", # noqa
- data_files=[
- (
- "presidio_evaluator/data_generator/raw_data",
- [
- "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv", # noqa
- "presidio_evaluator/data_generator/raw_data/templates.txt",
- "presidio_evaluator/data_generator/raw_data/companies_and_organizations.csv",
- "presidio_evaluator/data_generator/raw_data/nationalities.csv",
- "presidio_evaluator/data_generator/raw_data/us_driver_licenses.csv",
- ],
- )
- ],
- include_package_data=True,
- install_requires=[
- "presidio_analyzer",
- "presidio_anonymizer",
- "spacy>=3.0.0",
- "requests",
- "numpy",
- "pandas",
- "tqdm>=4.32.1",
- "jupyter>=1.0.0",
- "pytest>=4.6.2",
- "haikunator",
- "schwifty",
- "faker",
- "sklearn_crfsuite",
- "python-dotenv",
- "azure-ai-textanalytics==5.2.0"
- ],
-)
+ packages=packages,
+ package_data=package_data,
+ install_requires=install_requires,
+ python_requires=">=3.8,<4.0",
+)
\ No newline at end of file
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
index 8319e05..a5e4cec 100644
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception():
Evaluator.align_entity_types(
input_samples=[sample1], entities_mapping=entities_mapping
)
+
+
+def test_skip_words_are_not_counted_as_errors():
+ prediction = ["U-PERSON", "O", "O", "O", "U-LOCATION"]
+ model = MockTokensModel(prediction=prediction,
+ entities_to_keep=["LOCATION", "PERSON"])
+
+ evaluator = Evaluator(model=model)
+ sample = InputSample(
+ full_text="John is on the street", masked="I am the street", spans=None
+ )
+ sample.tokens = ["John", "is", "on", "the", "street"]
+ sample.tags = ["U-PERSON", "O", "O", "O", "O"]
+
+ evaluated = evaluator.evaluate_sample(sample, prediction)
+ final_evaluation = evaluator.calculate_score([evaluated])
+
+ assert final_evaluation.pii_precision == 1
+ assert final_evaluation.pii_recall == 1
diff --git a/tests/test_presidio_pseudonymize.py b/tests/test_presidio_pseudonymize.py
index a6d28ed..a756548 100644
--- a/tests/test_presidio_pseudonymize.py
+++ b/tests/test_presidio_pseudonymize.py
@@ -30,7 +30,7 @@ def fake_faker():
],
# fmt: on
)
-def test_presidio_psudonymize_two_entities(
+def test_presidio_pseudonymize_two_entities(
text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
):
@@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities(
assert value2 in pseudonym
assert text[:start1].lower() in pseudonym.lower()
assert text[end1:start2].lower() in pseudonym.lower()
+
+
+def test_simple_scenario():
+ original_text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io" # noqa
+ presidio_response = [
+ RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85),
+ RecognizerResult(entity_type="URL", start=49, end=69, score=0.95),
+ ]
+
+ PresidioPseudonymization().pseudonymize(original_text=original_text,
+ presidio_response=presidio_response,
+ count=5)