Skip to content

Commit

Permalink
Remove output in notebook. Update class
Browse files Browse the repository at this point in the history
  • Loading branch information
tranguyen221 committed Nov 28, 2022
1 parent 810f259 commit 6b50167
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 125 deletions.
154 changes: 33 additions & 121 deletions notebooks/models/Evaluate azure text analytics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -58,14 +58,28 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 100%|██████████| 1500/1500 [00:09<00:00, 153.03it/s]"
"tokenizing input: 0%| | 0/1500 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model en_core_web_sm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 100%|██████████| 1500/1500 [00:12<00:00, 119.46it/s]"
]
},
{
Expand Down Expand Up @@ -110,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -181,7 +195,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -194,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 33,
"metadata": {},
"outputs": [
{
Expand All @@ -208,14 +222,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1500/1500 [01:36<00:00, 15.61it/s]"
"Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1/1 [00:00<00:00, 4.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"saving experiment data to experiment_20221125-162355.json\n"
"saving experiment data to experiment_20221128-094558.json\n"
]
},
{
Expand Down Expand Up @@ -251,7 +265,8 @@
" \"IBAN_CODE\":\"InternationalBankingAccountNumber\", \n",
" \"US_DRIVER_LICENSE\":\"USDriversLicenseNumber\"\n",
" }\n",
"evaluator = Evaluator(model=model)\n",
"# List of entity names to focus the evaluator on (and ignore the rest) is defined with entities_to_keep parameter\n",
"evaluator = Evaluator(model=model, entities_to_keep=[\"Person\", \"Address\"])\n",
"dataset_ = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=i2b2_entities_to_text_analytics\n",
")\n",
Expand All @@ -274,109 +289,17 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion matrix:\n",
" Address Age CreditCardNumber DateTime \\\n",
"Address 1522 0 0 9 \n",
"Age 0 0 0 0 \n",
"CreditCardNumber 0 0 70 0 \n",
"DateTime 0 0 0 219 \n",
"Email 0 0 0 0 \n",
"IPAddress 0 0 0 0 \n",
"InternationalBankingAccountNumber 0 0 0 0 \n",
"O 110 0 0 395 \n",
"Organization 1 0 0 0 \n",
"Person 0 0 0 0 \n",
"PhoneNumber 0 0 0 3 \n",
"URL 0 0 0 0 \n",
"USDriversLicenseNumber 0 0 0 0 \n",
"USSocialSecurityNumber 0 0 0 0 \n",
"\n",
" Email IPAddress \\\n",
"Address 0 0 \n",
"Age 0 0 \n",
"CreditCardNumber 0 0 \n",
"DateTime 0 0 \n",
"Email 28 0 \n",
"IPAddress 0 22 \n",
"InternationalBankingAccountNumber 0 0 \n",
"O 0 0 \n",
"Organization 0 0 \n",
"Person 0 0 \n",
"PhoneNumber 0 2 \n",
"URL 0 0 \n",
"USDriversLicenseNumber 0 0 \n",
"USSocialSecurityNumber 0 0 \n",
"\n",
" InternationalBankingAccountNumber O \\\n",
"Address 0 1406 \n",
"Age 0 42 \n",
"CreditCardNumber 0 38 \n",
"DateTime 0 0 \n",
"Email 0 0 \n",
"IPAddress 0 0 \n",
"InternationalBankingAccountNumber 21 0 \n",
"O 0 19331 \n",
"Organization 0 58 \n",
"Person 0 24 \n",
"PhoneNumber 0 67 \n",
"URL 0 0 \n",
"USDriversLicenseNumber 0 2 \n",
"USSocialSecurityNumber 0 0 \n",
"\n",
" Organization Person PhoneNumber URL \\\n",
"Address 47 72 14 0 \n",
"Age 0 0 0 0 \n",
"CreditCardNumber 0 0 10 0 \n",
"DateTime 0 0 0 0 \n",
"Email 0 21 0 0 \n",
"IPAddress 0 0 0 0 \n",
"InternationalBankingAccountNumber 0 0 0 0 \n",
"O 136 39 9 0 \n",
"Organization 391 54 0 0 \n",
"Person 4 1340 0 0 \n",
"PhoneNumber 0 0 278 0 \n",
"URL 0 0 0 37 \n",
"USDriversLicenseNumber 0 0 7 0 \n",
"USSocialSecurityNumber 0 0 0 0 \n",
"\n",
" USDriversLicenseNumber \\\n",
"Address 0 \n",
"Age 0 \n",
"CreditCardNumber 0 \n",
"DateTime 0 \n",
"Email 0 \n",
"IPAddress 0 \n",
"InternationalBankingAccountNumber 0 \n",
"O 0 \n",
"Organization 0 \n",
"Person 0 \n",
"PhoneNumber 0 \n",
"URL 0 \n",
"USDriversLicenseNumber 0 \n",
"USSocialSecurityNumber 0 \n",
"\n",
" USSocialSecurityNumber \n",
"Address 0 \n",
"Age 0 \n",
"CreditCardNumber 0 \n",
"DateTime 0 \n",
"Email 0 \n",
"IPAddress 0 \n",
"InternationalBankingAccountNumber 0 \n",
"O 0 \n",
"Organization 0 \n",
"Person 0 \n",
"PhoneNumber 0 \n",
"URL 0 \n",
"USDriversLicenseNumber 0 \n",
"USSocialSecurityNumber 80 \n"
" Address O\n",
"Address 6 8\n",
"O 0 5\n"
]
}
],
Expand All @@ -387,7 +310,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand All @@ -396,21 +319,10 @@
"text": [
"Precision and recall\n",
" Entity Precision Recall Number of samples\n",
" Person 87.81% 97.88% 1369\n",
" Age nan% 0.00% 74\n",
" Email 100.00% 57.14% 49\n",
" URL 100.00% 100.00% 37\n",
"InternationalBankingAccountNumber 100.00% 100.00% 21\n",
" Organization 67.65% 77.58% 504\n",
" Address 93.20% 49.56% 3071\n",
"USSocialSecurityNumber 100.00% 100.00% 80\n",
" CreditCardNumber 100.00% 51.47% 136\n",
" IPAddress 91.67% 100.00% 22\n",
" DateTime 34.98% 100.00% 219\n",
"USDriversLicenseNumber nan% 0.00% 9\n",
" PhoneNumber 87.42% 79.43% 350\n",
" PII 80.19% 72.45% 5941\n",
"PII F measure: 73.42%\n"
" Address 100.00% 42.86% 14\n",
" Organization nan% 0.00% 1\n",
" PII 100.00% 40.00% 15\n",
"PII F measure: 43.61%\n"
]
}
],
Expand Down
4 changes: 0 additions & 4 deletions presidio_evaluator/models/text_analytics_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def __init__(
ta_client: Optional[TextAnalyticsClient] = None,
ta_key: Optional[str] = "",
ta_endpoint: Optional[str] = "",
entities_to_keep: List[str] = None,
verbose: bool = False,
labeling_scheme: str = "BIO",
score_threshold: float = 0.4,
Expand All @@ -24,11 +23,9 @@ def __init__(
:param ta_client: object of type TextAnalyticsClient
:param ta_key: Azure Cognitive Services for Language key
:param ta_endpoint: Azure Cognitive Services for Language endpoint
:param entities_to_keep: List of entities to predict on
:param entity_mapping: Mapping between the input dataset entities and the entities expected by Azure Cognitive Services for Language
"""
super().__init__(
entities_to_keep=entities_to_keep,
verbose=verbose,
labeling_scheme=labeling_scheme,
entity_mapping=entity_mapping,
Expand All @@ -40,7 +37,6 @@ def __init__(

if not ta_client:
ta_client = self.__authenticate_client(ta_key, ta_endpoint)
#self._update_recognizers_based_on_entities_to_keep(ta_client)
self.ta_client = ta_client


Expand Down

0 comments on commit 6b50167

Please sign in to comment.