Skip to content

Commit

Permalink
updates to fake name generator
Browse files Browse the repository at this point in the history
  • Loading branch information
omri374 committed Jan 15, 2022
1 parent f8d08bf commit c9151c6
Show file tree
Hide file tree
Showing 4 changed files with 3,016 additions and 3,014 deletions.
15 changes: 8 additions & 7 deletions notebooks/1_Generate_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"import datetime\n",
"import pprint\n",
"from collections import Counter\n",
"\n",
"import tqdm\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
Expand Down Expand Up @@ -162,7 +162,8 @@
"fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
"\n",
"# Update to match existing templates\n",
"PresidioDataGenerator.update_fake_name_generator_df(fake_name_generator_df)"
"fake_name_generator_df = PresidioDataGenerator.update_fake_name_generator_df(fake_name_generator_df)\n",
"fake_name_generator_df.head()"
]
},
{
Expand Down Expand Up @@ -422,7 +423,7 @@
"%%time\n",
"input_samples = [\n",
" InputSample.from_faker_spans_result(faker_spans_result=fake_record)\n",
" for fake_record in fake_records\n",
" for fake_record in tqdm.tqdm(fake_records)\n",
"]"
]
},
Expand Down Expand Up @@ -493,9 +494,9 @@
"source": [
"### Next steps\n",
"\n",
"- Evaluate Presidio using this fake data. [Sample](\"4_Evaluate_Presidio_Analyzer.ipynb\")\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](\"3_Split_by_pattern_#.ipynb\")\n",
"- Conduct a small exploratory data analysis on the generated data. [Sample](\"2_PII_EDA.ipynb\")"
"- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
"- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
]
},
{
Expand Down Expand Up @@ -543,4 +544,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,8 @@ def _get_random_record(self):
def _match_to_span(self, text: str, **kwargs) -> List[FakerSpan]:
"""Adds logic for sampling from input records if possible."""
matches = _re_token.finditer(text)
record = (
self._get_random_record()
) # Sample one record (Dict containing fake values)
# Sample one record (Dict containing fake values)
record = self._get_random_record()

results: List[FakerSpan] = []
for match in matches:
Expand All @@ -111,7 +110,9 @@ def _match_to_span(self, text: str, **kwargs) -> List[FakerSpan]:
def format(self, formatter: str, *args: Any, **kwargs: Any) -> str:
"""Fill in fake data. If the input record has the requested entity, return its value."""
record = kwargs.get("record")
if not record or not record.get(formatter): # type not in record, go to default faker
if not record or not record.get(
formatter
): # type not in record, go to default faker
return super().format(formatter)

return record[formatter]
Loading

0 comments on commit c9151c6

Please sign in to comment.