|
16 | 16 | "id": "028823c5",
|
17 | 17 | "metadata": {},
|
18 | 18 | "source": [
|
19 |
| - "# Augment common non-negatives\n", |
| 19 | + "# Generate common non-negatives\n", |
20 | 20 | "\n",
|
21 |
| - "Add triplets and name variants and nicknames to common non-negatives" |
| 21 | + "Add existing standard, triplets, and nicknames to common non-negatives" |
22 | 22 | ]
|
23 | 23 | },
|
24 | 24 | {
|
|
28 | 28 | "metadata": {},
|
29 | 29 | "outputs": [],
|
30 | 30 | "source": [
|
| 31 | + "import re\n", |
| 32 | + "\n", |
31 | 33 | "import pandas as pd\n",
|
32 | 34 | "from tqdm.auto import tqdm\n",
|
33 | 35 | "\n",
|
|
41 | 43 | "metadata": {},
|
42 | 44 | "outputs": [],
|
43 | 45 | "source": [
|
44 |
| - "given_surname = \"given\"\n", |
| 46 | + "given_surname = \"surname\"\n", |
| 47 | + "\n", |
| 48 | + "num_common_names = 10000\n", |
45 | 49 | "\n",
|
46 |
| - "common_non_negatives_path = f\"../references/common_{given_surname}_non_negatives.csv\"\n", |
| 50 | + "pref_path = f\"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz\"\n", |
| 51 | + "std_path = f\"../references/std_{given_surname}.txt\"\n", |
47 | 52 | "triplets_path=f\"../data/processed/tree-hr-{given_surname}-triplets-v2-1000.csv.gz\"\n",
|
48 |
| - "name_variants_path = f\"../references/{given_surname}_variants.csv\"\n", |
49 | 53 | "given_nicknames_path = \"../references/givenname_nicknames.csv\"\n",
|
50 | 54 | "\n",
|
51 |
| - "augmented_path = f\"../data/processed/common_{given_surname}_non_negatives-augmented.csv\"" |
| 55 | + "non_negatives_path = f\"../data/processed/common_{given_surname}_non_negatives.csv\"" |
52 | 56 | ]
|
53 | 57 | },
|
54 | 58 | {
|
|
64 | 68 | "id": "401ad99c",
|
65 | 69 | "metadata": {},
|
66 | 70 | "source": [
|
67 |
| - "### read common non-negatives" |
| 71 | + "### read preferred names" |
68 | 72 | ]
|
69 | 73 | },
|
70 | 74 | {
|
|
74 | 78 | "metadata": {},
|
75 | 79 | "outputs": [],
|
76 | 80 | "source": [
|
77 |
| - "common_non_negatives_df = read_csv(common_non_negatives_path)\n", |
78 |
| - "print(len(common_non_negatives_df))\n", |
79 |
| - "common_non_negatives_df.head(3)" |
| 81 | + "pref_df = read_csv(pref_path)\n", |
| 82 | + "common_names = set([name for name in pref_df['name'][:num_common_names].tolist() \\\n", |
| 83 | + " if len(name) > 1 and re.fullmatch(r'[a-z]+', name)])\n", |
| 84 | + "len(common_names)" |
| 85 | + ] |
| 86 | + }, |
| 87 | + { |
| 88 | + "cell_type": "markdown", |
| 89 | + "id": "2be4909b", |
| 90 | + "metadata": {}, |
| 91 | + "source": [ |
| 92 | + "## Start with FS buckets" |
80 | 93 | ]
|
81 | 94 | },
|
82 | 95 | {
|
83 | 96 | "cell_type": "code",
|
84 | 97 | "execution_count": null,
|
85 |
| - "id": "ab78be66", |
| 98 | + "id": "6a359ed7", |
86 | 99 | "metadata": {},
|
87 | 100 | "outputs": [],
|
88 | 101 | "source": [
|
| 102 | + "common_names_set = set(common_names)\n", |
89 | 103 | "common_non_negatives = set()\n",
|
90 |
| - "for name1, name2 in common_non_negatives_df.values.tolist():\n", |
91 |
| - " common_non_negatives.add((name1, name2))\n", |
92 |
| - " common_non_negatives.add((name2, name1))\n", |
93 |
| - "len(common_non_negatives)" |
| 104 | + "\n", |
| 105 | + "with open(std_path) as f:\n", |
| 106 | + " for ix, line in enumerate(f.readlines()):\n", |
| 107 | + " line = line.strip()\n", |
| 108 | + " head_names, tail_names = line.split(':')\n", |
| 109 | + " head_names = head_names.strip()\n", |
| 110 | + " tail_names = tail_names.strip()\n", |
| 111 | + " names = set()\n", |
| 112 | + " if len(head_names):\n", |
| 113 | + " names |= set(head_names.split(' '))\n", |
| 114 | + " if len(tail_names):\n", |
| 115 | + " names |= set(tail_names.split(' '))\n", |
| 116 | + " names = [name for name in names if len(name) > 0]\n", |
| 117 | + " for name1 in names:\n", |
| 118 | + " if name1 not in common_names_set:\n", |
| 119 | + " continue\n", |
| 120 | + " for name2 in names:\n", |
| 121 | + " if name2 not in common_names_set:\n", |
| 122 | + " continue\n", |
| 123 | + " if name1 == name2:\n", |
| 124 | + " continue\n", |
| 125 | + " common_non_negatives.add((name1, name2))\n", |
| 126 | + "print(len(common_non_negatives))" |
94 | 127 | ]
|
95 | 128 | },
|
96 | 129 | {
|
|
132 | 165 | "len(common_non_negatives)"
|
133 | 166 | ]
|
134 | 167 | },
|
135 |
| - { |
136 |
| - "cell_type": "markdown", |
137 |
| - "id": "2a9e8224", |
138 |
| - "metadata": {}, |
139 |
| - "source": [ |
140 |
| - "### add name variants" |
141 |
| - ] |
142 |
| - }, |
143 |
| - { |
144 |
| - "cell_type": "code", |
145 |
| - "execution_count": null, |
146 |
| - "id": "7174dff9", |
147 |
| - "metadata": {}, |
148 |
| - "outputs": [], |
149 |
| - "source": [ |
150 |
| - "name_variants_df = read_csv(name_variants_path)\n", |
151 |
| - "print(len(name_variants_df))\n", |
152 |
| - "name_variants_df.head(3)" |
153 |
| - ] |
154 |
| - }, |
155 |
| - { |
156 |
| - "cell_type": "code", |
157 |
| - "execution_count": null, |
158 |
| - "id": "fc2e63ac", |
159 |
| - "metadata": {}, |
160 |
| - "outputs": [], |
161 |
| - "source": [ |
162 |
| - "for name1, name2 in name_variants_df.values.tolist():\n", |
163 |
| - " common_non_negatives.add((name1, name2))\n", |
164 |
| - " common_non_negatives.add((name2, name1))\n", |
165 |
| - "len(common_non_negatives)" |
166 |
| - ] |
167 |
| - }, |
168 | 168 | {
|
169 | 169 | "cell_type": "markdown",
|
170 | 170 | "id": "b7d9ce98",
|
|
197 | 197 | "id": "e0df5a95",
|
198 | 198 | "metadata": {},
|
199 | 199 | "source": [
|
200 |
| - "## Save augmented non-negatives" |
| 200 | + "## Save common non-negatives" |
201 | 201 | ]
|
202 | 202 | },
|
203 | 203 | {
|
|
211 | 211 | "for name1, name2 in common_non_negatives:\n",
|
212 | 212 | " records.append({'name1': name1, 'name2': name2})\n",
|
213 | 213 | "df = pd.DataFrame(records)\n",
|
214 |
| - "df.to_csv(augmented_path, index=False)" |
215 |
| - ] |
216 |
| - }, |
217 |
| - { |
218 |
| - "cell_type": "markdown", |
219 |
| - "id": "21fa63a6", |
220 |
| - "metadata": {}, |
221 |
| - "source": [ |
222 |
| - "## Miscellaneous\n", |
223 |
| - "\n", |
224 |
| - "Generate common non-negatives from existing standard" |
225 |
| - ] |
226 |
| - }, |
227 |
| - { |
228 |
| - "cell_type": "raw", |
229 |
| - "id": "9278c668", |
230 |
| - "metadata": {}, |
231 |
| - "source": [ |
232 |
| - "common_names_set = set(common_names)\n", |
233 |
| - "\n", |
234 |
| - "with open(f\"../references/std_{given_surname}.txt\") as f:\n", |
235 |
| - " for ix, line in enumerate(f.readlines()):\n", |
236 |
| - " line = line.strip()\n", |
237 |
| - " head_names, tail_names = line.split(':')\n", |
238 |
| - " head_names = head_names.strip()\n", |
239 |
| - " tail_names = tail_names.strip()\n", |
240 |
| - " names = set()\n", |
241 |
| - " if len(head_names):\n", |
242 |
| - " names |= set(head_names.split(' '))\n", |
243 |
| - " if len(tail_names):\n", |
244 |
| - " names |= set(tail_names.split(' '))\n", |
245 |
| - " names = [name for name in names if len(name) > 0]\n", |
246 |
| - " for i in range(0, len(names)):\n", |
247 |
| - " if names[i] not in common_names_set:\n", |
248 |
| - " continue\n", |
249 |
| - " for j in range(i+1, len(names)):\n", |
250 |
| - " if names[j] not in common_names_set:\n", |
251 |
| - " continue\n", |
252 |
| - " name1 = names[i]\n", |
253 |
| - " name2 = names[j]\n", |
254 |
| - " if name1 > name2:\n", |
255 |
| - " name1, name2 = name2, name1\n", |
256 |
| - " common_non_negatives.add(f\"{name1}:{name2}\")\n", |
257 |
| - "print(len(common_non_negatives))\n", |
258 |
| - "\n", |
259 |
| - "variants = []\n", |
260 |
| - "for name_pair in sorted(common_non_negatives):\n", |
261 |
| - " name1, name2 = name_pair.split(':')\n", |
262 |
| - " if name1 > name2:\n", |
263 |
| - " print(\"ERROR\", name1, name2)\n", |
264 |
| - " variants.append({\"name1\": name1, \"name2\": name2})\n", |
265 |
| - "print(len(variants))\n", |
266 |
| - "df = pd.DataFrame(variants)\n", |
267 |
| - "df.to_csv(common_non_negatives_path, index=False)" |
| 214 | + "df.to_csv(non_negatives_path, index=False)" |
268 | 215 | ]
|
269 | 216 | },
|
270 | 217 | {
|
271 | 218 | "cell_type": "code",
|
272 | 219 | "execution_count": null,
|
273 |
| - "id": "dd31da69", |
| 220 | + "id": "4a0ac472", |
274 | 221 | "metadata": {},
|
275 | 222 | "outputs": [],
|
276 | 223 | "source": []
|
|
0 commit comments