updates to fake name generator

airobotproject · Jan 15, 2022 · c9151c6 · c9151c6
1 parent f8d08bf
commit c9151c6
Show file tree

Hide file tree

Showing 4 changed files with 3,016 additions and 3,014 deletions.
diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb
@@ -11,7 +11,7 @@
     "import datetime\n",
     "import pprint\n",
     "from collections import Counter\n",
-    "\n",
+    "import tqdm\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
@@ -162,7 +162,8 @@
     "fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
     "\n",
     "# Update to match existing templates\n",
-    "PresidioDataGenerator.update_fake_name_generator_df(fake_name_generator_df)"
+    "fake_name_generator_df = PresidioDataGenerator.update_fake_name_generator_df(fake_name_generator_df)\n",
+    "fake_name_generator_df.head()"
    ]
   },
   {
@@ -422,7 +423,7 @@
     "%%time\n",
     "input_samples = [\n",
     "    InputSample.from_faker_spans_result(faker_spans_result=fake_record)\n",
-    "    for fake_record in fake_records\n",
+    "    for fake_record in tqdm.tqdm(fake_records)\n",
     "]"
    ]
   },
@@ -493,9 +494,9 @@
    "source": [
     "### Next steps\n",
     "\n",
-    "- Evaluate Presidio using this fake data. [Sample](\"4_Evaluate_Presidio_Analyzer.ipynb\")\n",
-    "- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](\"3_Split_by_pattern_#.ipynb\")\n",
-    "- Conduct a small exploratory data analysis on the generated data. [Sample](\"2_PII_EDA.ipynb\")"
+    "- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
+    "- Split to train/test/validation while ensuring sentences originating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
+    "- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
    ]
   },
   {
@@ -543,4 +544,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/presidio_evaluator/data_generator/faker_extensions/record_generator.py b/presidio_evaluator/data_generator/faker_extensions/record_generator.py
@@ -84,9 +84,8 @@ def _get_random_record(self):
     def _match_to_span(self, text: str, **kwargs) -> List[FakerSpan]:
         """Adds logic for sampling from input records if possible."""
         matches = _re_token.finditer(text)
-        record = (
-            self._get_random_record()
-        )  # Sample one record (Dict containing fake values)
+        # Sample one record (Dict containing fake values)
+        record = self._get_random_record()
 
         results: List[FakerSpan] = []
         for match in matches:
@@ -111,7 +110,9 @@ def _match_to_span(self, text: str, **kwargs) -> List[FakerSpan]:
     def format(self, formatter: str, *args: Any, **kwargs: Any) -> str:
         """Fill in fake data. If the input record has the requested entity, return its value."""
         record = kwargs.get("record")
-        if not record or not record.get(formatter):  # type not in record, go to default faker
+        if not record or not record.get(
+            formatter
+        ):  # type not in record, go to default faker
             return super().format(formatter)
 
         return record[formatter]