-
Notifications
You must be signed in to change notification settings - Fork 567
/
azure_ai_language.py
163 lines (140 loc) · 6.16 KB
/
azure_ai_language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import logging
import os
from typing import List, Optional
try:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
except ImportError:
TextAnalyticsClient = None
AzureKeyCredential = None
from presidio_analyzer import AnalysisExplanation, RecognizerResult, RemoteRecognizer
from presidio_analyzer.nlp_engine import NlpArtifacts
logger = logging.getLogger("presidio-analyzer")
class AzureAILanguageRecognizer(RemoteRecognizer):
    """Wrapper for PII detection using Azure AI Language.

    Text is sent to the service through a ``TextAnalyticsClient`` and every
    returned PII entity is converted into a ``RecognizerResult``.
    """

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional["TextAnalyticsClient"] = None,
        azure_ai_key: Optional[str] = None,
        azure_ai_endpoint: Optional[str] = None,
        **kwargs
    ):
        """
        Wrap the PII detection in Azure AI Language.

        :param supported_entities: List of supported entities for this recognizer.
        If None, all supported entities will be used.
        :param supported_language: Language code to use for the recognizer.
        :param ta_client: object of type TextAnalyticsClient. If missing,
        the client will be created using the key and endpoint.
        :param azure_ai_key: Azure AI for language key
        :param azure_ai_endpoint: Azure AI for language endpoint
        :param kwargs: Additional arguments required by the parent class
        :raises ValueError: If no client is passed and the Azure SDK is not
        installed, or if a client must be created but the key/endpoint
        cannot be resolved.

        For more info, see https://learn.microsoft.com/en-us/azure/ai-services/language-service/personally-identifiable-information/overview
        """  # noqa E501

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
            version="5.2.0",
            **kwargs
        )

        # TextAnalyticsClient is set to None at module level when the optional
        # Azure SDK import fails.
        is_available = bool(TextAnalyticsClient)
        if not ta_client and not is_available:
            # Adjacent string literals are concatenated verbatim, so each
            # fragment carries its own separating space.
            raise ValueError(
                "Azure AI Language is not available. "
                "Please install the required dependencies: "
                "1. azure-ai-textanalytics "
                "2. azure-core"
            )

        if not supported_entities:
            self.supported_entities = self.__get_azure_ai_supported_entities()

        if not ta_client:
            ta_client = self.__authenticate_client(azure_ai_key, azure_ai_endpoint)

        self.ta_client = ta_client

    def get_supported_entities(self) -> List[str]:
        """
        Return the list of entities this recognizer can identify.

        :return: A list of the supported entities by this recognizer
        """
        return self.supported_entities

    @staticmethod
    def __get_azure_ai_supported_entities() -> List[str]:
        """Return the list of all supported entities for Azure AI Language.

        :return: The upper-cased value of every ``PiiEntityCategory`` member.
        """
        # Imported lazily: the Azure SDK is an optional dependency.
        from azure.ai.textanalytics._models import PiiEntityCategory  # noqa

        return [r.value.upper() for r in PiiEntityCategory]

    @staticmethod
    def __authenticate_client(
        key: Optional[str], endpoint: Optional[str]
    ) -> "TextAnalyticsClient":
        """Authenticate the client using the key and endpoint.

        Explicit arguments take precedence; otherwise the values are read from
        the ``AZURE_AI_KEY`` / ``AZURE_AI_ENDPOINT`` environment variables.

        :param key: Azure AI Language key
        :param endpoint: Azure AI Language endpoint
        :return: A ``TextAnalyticsClient`` built from the resolved values.
        :raises ValueError: If the key or the endpoint is missing or empty.
        """
        key = key or os.getenv("AZURE_AI_KEY")
        endpoint = endpoint or os.getenv("AZURE_AI_ENDPOINT")

        # Reject empty strings too (e.g. an environment variable set to ""),
        # so the problem is reported here rather than on the first request.
        if not key:
            raise ValueError(
                "Azure AI Language key is required. "
                "Please provide a key or set the AZURE_AI_KEY environment variable."
            )

        if not endpoint:
            raise ValueError(
                "Azure AI Language endpoint is required. "
                "Please provide an endpoint "
                "or set the AZURE_AI_ENDPOINT environment variable."
            )

        return TextAnalyticsClient(
            endpoint=endpoint, credential=AzureKeyCredential(key)
        )

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Analyze text using Azure AI Language.

        :param text: Text to analyze
        :param entities: List of entities to return. If empty or None, all of
        this recognizer's supported entities are returned.
        :param nlp_artifacts: Object of type NlpArtifacts, not used in this recognizer.
        :return: A list of RecognizerResult, one per each entity found in the text.
        """
        if not entities:
            entities = self.supported_entities

        # An entity is kept only when it is both supported by this recognizer
        # and requested by the caller (case-insensitive). Build the lookup
        # once instead of re-lowercasing both lists for every entity.
        supported = {ent.lower() for ent in self.supported_entities}
        allowed = {ent.lower() for ent in entities if ent.lower() in supported}

        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Error documents carry no entities. Report them instead of
                # dropping them silently; the analyzed text itself is never
                # logged. Same logger name as the module-level ``logger``.
                logging.getLogger("presidio-analyzer").warning(
                    "Azure AI Language returned an error for document %s: %s",
                    getattr(doc, "id", None),
                    getattr(doc, "error", None),
                )
                continue

            for entity in doc.entities:
                # Normalize into a local; the SDK response object is left
                # untouched.
                category = entity.category.upper()
                if category.lower() not in allowed:
                    continue

                analysis_explanation = AzureAILanguageRecognizer._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=category,
                        start=entity.offset,
                        end=entity.offset + entity.length,
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """Build the explanation attached to a single detection.

        :param original_score: Confidence score reported for the entity.
        :param entity_type: Entity type used in the textual explanation.
        :return: An AnalysisExplanation naming this recognizer.
        """
        # ``SomeClass.__class__`` is the metaclass, so the class's own
        # ``__name__`` is used to report the recognizer name.
        explanation = AnalysisExplanation(
            recognizer=AzureAILanguageRecognizer.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Azure AI Language",
        )
        return explanation