diff --git a/docs/development.md b/docs/development.md
index 72e784c5f..73851bdae 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -179,3 +179,35 @@ Edit [charts/presidio/values.yaml](../charts/presidio/values.yaml) to:
 - Setup secret name (for private registries)
 - Change presidio services version
 - Change default scale
+
+
+## NLP Engine Configuration
+
+1. The NLP engines deployed are set on startup based on the YAML configuration files in `presidio-analyzer/conf/`. The default configuration (`default.yaml`) uses the SpaCy engine with the large English model (`en_core_web_lg`).
+
+2. The format of the YAML file is as follows:
+
+```yaml
+nlp_engine_name: spacy # {spacy, stanza}
+models:
+  -
+    lang_code: en # code corresponds to `supported_language` in any custom recognizers
+    model_name: en_core_web_lg # the name of the SpaCy or Stanza model
+  -
+    lang_code: de # add one item per additional language to support
+    model_name: de
+```
+
+3. By default, the `load_predefined_recognizers` method of the `RecognizerRegistry` class is called to load both language-specific and language-agnostic recognizers.
+
+4. Downloading additional models:
+    * SpaCy NLP Models: [models download page](https://spacy.io/usage/models)
+    * Stanza NLP Models: [models download page](https://stanfordnlp.github.io/stanza/available_models.html)
+
+    ```sh
+    # download models - tldr
+    # spacy
+    python -m spacy download en_core_web_lg
+    # stanza
+    python -c 'import stanza; stanza.download("en");'
+    ```
diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml
index b9ef47fb2..68f0f0f75 100644
--- a/presidio-analyzer/conf/default.yaml
+++ b/presidio-analyzer/conf/default.yaml
@@ -1,6 +1,6 @@
 nlp_engine_name: spacy
 models:
   -
-    name: en
-    lang: en_core_web_lg
+    lang_code: en
+    model_name: en_core_web_lg
 
diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml
index 6bcea90cf..131db7931 100644
--- a/presidio-analyzer/conf/spacy.yaml
+++ b/presidio-analyzer/conf/spacy.yaml
@@ -1,5 +1,5 @@
 nlp_engine_name: spacy
 models:
   -
-    lang: en
-    name: en
+    lang_code: en
+    model_name: en_core_web_sm
diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml
index 12de15efa..89908e9b0 100644
--- a/presidio-analyzer/conf/spacy_multilingual.yaml
+++ b/presidio-analyzer/conf/spacy_multilingual.yaml
@@ -1,8 +1,8 @@
 nlp_engine_name: spacy
 models:
   -
-    name: en
-    lang: en
+    lang_code: en
+    model_name: en
   -
-    name: de
-    lang: de
+    lang_code: de
+    model_name: de
diff --git a/presidio-analyzer/conf/stanza.yaml b/presidio-analyzer/conf/stanza.yaml
index 2b22e9484..7d8090e4a 100644
--- a/presidio-analyzer/conf/stanza.yaml
+++ b/presidio-analyzer/conf/stanza.yaml
@@ -1,6 +1,6 @@
 nlp_engine_name: stanza
 models:
   -
-    lang: en
-    name: en
+    lang_code: en
+    model_name: en
 
diff --git a/presidio-analyzer/conf/stanza_multilingual.yaml b/presidio-analyzer/conf/stanza_multilingual.yaml
index 32cfd39c9..d0e02e39c 100644
--- a/presidio-analyzer/conf/stanza_multilingual.yaml
+++ b/presidio-analyzer/conf/stanza_multilingual.yaml
@@ -1,9 +1,9 @@
 nlp_engine_name: stanza
 models:
   -
-    lang: en
-    name: en
+    lang_code: en
+    model_name: en
   -
-    lang: de
-    name: de
+    lang_code: de
+    model_name: de
 
diff --git a/presidio-analyzer/presidio_analyzer/app.py b/presidio-analyzer/presidio_analyzer/app.py
index 384da6747..d1d0f7d17 100644
--- a/presidio-analyzer/presidio_analyzer/app.py
+++ b/presidio-analyzer/presidio_analyzer/app.py
@@ -65,6 +65,15 @@ def serve_command_handler(
     nlp_conf_path="conf/default.yaml",
     max_workers=10,
 ):
+    """
+    :param enable_trace_pii: boolean, whether to enable PII tracing
+    :param env_grpc_port: boolean, whether to read the gRPC port from an
+        environment variable (default: False)
+    :param grpc_port: port for the gRPC server (default: 3000)
+    :param nlp_conf_path: path to the NLP engine configuration file
+        (default: 'conf/default.yaml')
+    :param max_workers: number of worker threads for the gRPC server (default: 10)
+    """
     logger.info("Starting GRPC server")
     server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
     logger.info("GRPC started")
@@ -79,11 +88,11 @@ def serve_command_handler(
     )
     nlp_conf = {
         "nlp_engine_name": "spacy",
-        "models": [{"lang": "en", "name": "en_core_web_lg"}],
+        "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
     }
     nlp_engine_name = nlp_conf["nlp_engine_name"]
     nlp_engine_class = NLP_ENGINES[nlp_engine_name]
-    nlp_engine_opts = {m["lang"]: m["name"] for m in nlp_conf["models"]}
+    nlp_engine_opts = {m["lang_code"]: m["model_name"] for m in nlp_conf["models"]}
     nlp_engine = nlp_engine_class(nlp_engine_opts)
 
     logger.info(f"{nlp_engine_class.__name__} created")
diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
index 2d40e711f..bb8fa91fd 100644
--- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
+++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
@@ -18,12 +18,12 @@ class SpacyNlpEngine(NlpEngine):
 
     def __init__(self, models=None):
         if not models:
-            models = {"en": "en"}
-        logger.debug(f"Loading NLP models: {models.values()}")
+            models = {"en": "en_core_web_lg"}
+        logger.debug(f"Loading SpaCy models: {models.values()}")
 
         self.nlp = {
-            lang: spacy.load(model_name, disable=['parser', 'tagger'])
-            for lang, model_name in models.items()
+            lang_code: spacy.load(model_name, disable=['parser', 'tagger'])
+            for lang_code, model_name in models.items()
         }
 
         for model_name in models.values():
diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
index 6e5157973..fb756ed39 100644
--- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
+++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
@@ -24,14 +24,14 @@ class StanzaNlpEngine(SpacyNlpEngine):
     def __init__(self, models=None):
         if not models:
             models = {"en": "en"}
-        logger.debug(f"Loading NLP models: {models.values()}")
+        logger.debug(f"Loading Stanza models: {models.values()}")
 
         self.nlp = {
-            lang: StanzaLanguage(
+            lang_code: StanzaLanguage(
                 stanza.Pipeline(
                     model_name,
-                    processors="tokenize,mwt,pos,lemma,ner",
+                    processors="tokenize,pos,lemma,ner",
                 )
             )
-            for lang, model_name in models.items()
+            for lang_code, model_name in models.items()
         }
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py
index 9d654a90a..7c7eab72f 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py
@@ -38,6 +38,11 @@ def __init__(
         supported_entity="CREDIT_CARD",
         replacement_pairs=None,
     ):
+        """
+        :param replacement_pairs: list of (search, replacement) tuples
+            applied to the text during recognition
+            (default: [("-", ""), (" ", "")], i.e. dashes and spaces are removed).
+        """
         self.replacement_pairs = replacement_pairs \
             if replacement_pairs \
             else [("-", ""), (" ", "")]
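Note: as a companion to the configuration format documented above, the sketch below shows how a conf file written with the new `lang_code` / `model_name` keys maps onto engine construction. It mirrors the dictionary comprehension in `serve_command_handler`, but the standalone script, the local `ENGINES` dict, and the use of PyYAML are illustrative assumptions rather than code from this change.

```python
# Minimal sketch (not shipped code): build an NLP engine from a conf file
# that uses the new lang_code / model_name keys.
# Assumes PyYAML is installed and the referenced spaCy model is downloaded.
import yaml

from presidio_analyzer.nlp_engine.spacy_nlp_engine import SpacyNlpEngine

# app.py resolves nlp_engine_name through its NLP_ENGINES mapping;
# this local dict mirrors that idea ("stanza" would map to StanzaNlpEngine).
ENGINES = {"spacy": SpacyNlpEngine}

with open("conf/default.yaml") as f:  # any of the conf/*.yaml files
    nlp_conf = yaml.safe_load(f)

# For conf/default.yaml this yields {"en": "en_core_web_lg"}
nlp_engine_opts = {m["lang_code"]: m["model_name"] for m in nlp_conf["models"]}
nlp_engine = ENGINES[nlp_conf["nlp_engine_name"]](nlp_engine_opts)

# SpacyNlpEngine keeps one pipeline per lang_code in its .nlp dict
print(f"Loaded {type(nlp_engine).__name__} for languages: {list(nlp_engine.nlp)}")
```

Pointing the same sketch at `conf/spacy_multilingual.yaml` would build one pipeline per `lang_code` entry, which is exactly what `SpacyNlpEngine.__init__` does in the diff above.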