Commit 2ebe22a

improve documentation
1 parent 834f918 commit 2ebe22a

20 files changed, +318716 -282 lines changed

README.md

Lines changed: 12 additions & 15 deletions
@@ -41,19 +41,20 @@ Run notebooks in the order listed
 * 100_train_test_split - split similar names into train and test sets, removing bad pairs
   * input: similar-v2, pref-names, bad-pairs
   * output: train-v2, test-v2
+*
 * 200_generate_triplets - generate triplets from training data
   * input: train-v2
   * output: triplets
 * 204_generate_subword_tokenizer - train a subword tokenizer
   * input: triplets, pref-names, train-v2
   * output: subword-tokenizer
-* 205_augment_common_non_negatives - augment common non-negatives with additional names
-  * input: common-non-negatives, triplets, name-variants, given-nicknames
-  * output: common-non-negatives-augmented
+* 205_generate_common_non_negatives - generate pairs of names that are not negative examples
+  * input: std-buckets, pref-names, triplets, given-nicknames
+  * output: common-non-negatives
 * 206_analyze_triplets - review triplets (optional)
-  * input: triplets, pref-names, common-non-negatives-augmented,
+  * input: triplets, pref-names, common-non-negatives,
 * 207_augment_triplets - augment triplets with additional triplets
-  * input: triplets, pref-names, common-non-negatives-augmented, subword-tokenizer
+  * input: triplets, pref-names, common-non-negatives, subword-tokenizer
   * output: triplets-augmented
 * 220_create_language_model_dataset - create a dataset to train roberta
   * input: pref-names, tree-hr-parquet
@@ -62,16 +63,16 @@ Run notebooks in the order listed
   * input: all-tree-hr-names-sample, pref-names
   * output: roberta
 * 222_train_cross_encoder - train a cross-encoder model
-  * input: roberta, triplets-augmented, pref-names
+  * input: roberta, triplets-augmented
   * output: cross-encoder
 * 223_generate_triplets_from_cross_encoder - generate triplets for training the bi-encoder from the cross-encoder
-  * input: pref-names, train-v2, common-non-negatives-augmented, std-buckets, cross-encoder
+  * input: pref-names, train-v2, common-non-negatives, std-buckets, cross-encoder
   * output: cross-encoder-triplets-0 and cross-encoder-triplets-common (run twice)
 * 224_train_bi_encoder - train a bi-encoder model
   * input: cross-encoder-triplets-common-0-augmented, subword-tokenizer
   * output: bi-encoder
 * 230_eval_bi_encoder - evaluate a bi-encoder model, used to pick hyperparameters
-  * input: subword-tokenizer, bi-encoder, pref-names, triplets, common-non-negatives-augmented
+  * input: subword-tokenizer, bi-encoder, pref-names, triplets, common-non-negatives
 * 240_create_clusters_from_buckets - split buckets into clusters using the cross encoder; clusters in the same bucket form a super-cluster
   * input: std-buckets, subword-tokenizer, cross-encoder, bi-encoder, pref-names
   * output: clusters, super-clusters
@@ -101,24 +102,20 @@ Run notebooks in the order listed
   * f"../data/models/bi_encoder-{given_surname}-{model_type}.pth"
 * clusters - similar names from the same bucket
   * f"../data/processed/clusters_{given_surname}-{scorer}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"
-* !!! common-non-negatives - pairs of names that may be similar (are not negative)
-  * f"../references/common_{given_surname}_non_negatives.csv"
-* common-non-negatives-augmented - pairs of names that may be similar (are not negative), augmented
-  * f"../data/processed/common_{given_surname}_non_negatives-augmented.csv"
+* common-non-negatives - pairs of names that may be similar (are not negative)
+  * f"../data/processed/common_{given_surname}_non_negatives.csv"
 * cross-encoder - model to evaluate the similarity of two names
   * f"../data/models/cross-encoder-{given_surname}-10m-265-same-all"
 * cross-encoder-triplets-0 - triplets generated from cross-encoder with num_easy_negs=0
   * f"../data/processed/cross-encoder-triplets-{given_surname}-0.csv"
 * cross-encoder-triplets-common - triplets generated from cross-encoder with num_easy_negs='common'
   * f"../data/processed/cross-encoder-triplets-{given_surname}-common.csv"
-* cross-encoder-triplets-common-0-augmented = cross-encoder-triplets-common + cross-encoder-triplets-0 + test-triplets-augmented
+* cross-encoder-triplets-common-0-augmented = cross-encoder-triplets-common + cross-encoder-triplets-0 + triplets-augmented
   * f"../data/processed/cross-encoder-triplets-{given_surname}-common-0-augmented.csv"
 * dissimilar-v2 - pairs of names from tree-record attachments that are probably not similar
   * f"s3://familysearch-names/processed/tree-hr-{given_surname}-dissimilar-v2.csv.gz"
 * given-nicknames - nicknames for given names (hand curated from a variety of sources)
   * f"../references/givenname_nicknames.csv"
-* !!! name-variants - ???
-  * f"../references/{given_surname}_variants.csv"
 * nearby-clusters - for each cluster, list the nearby clusters
   * f"../data/processed/nearby_clusters_{given_surname}-{scorer}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"
 * pref-names - preferred names from the tree
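
The common-non-negatives artifact described above is a two-column CSV (name1, name2) that downstream notebooks (206, 207, 230) use to decide whether a pair of names may be similar. A minimal sketch of loading it into an order-independent pair set, assuming only pandas and the column names shown in the notebook diffs below; the sample data is made up:

```python
import io

import pandas as pd


def load_non_negative_pairs(csv_file):
    """Read a name1,name2 CSV and return a set containing both orderings,
    so membership tests don't depend on pair order."""
    df = pd.read_csv(csv_file)
    pairs = set()
    for name1, name2 in df[["name1", "name2"]].itertuples(index=False):
        pairs.add((name1, name2))
        pairs.add((name2, name1))
    return pairs


if __name__ == "__main__":
    # hypothetical sample standing in for common_{given_surname}_non_negatives.csv
    sample = io.StringIO("name1,name2\nann,anna\njon,john\n")
    pairs = load_non_negative_pairs(sample)
    print(("anna", "ann") in pairs)  # True
```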

notebooks/205_augment_non_negatives.ipynb renamed to notebooks/205_generate_common_non_negatives.ipynb

Lines changed: 51 additions & 104 deletions
@@ -16,9 +16,9 @@
 "id": "028823c5",
 "metadata": {},
 "source": [
-"# Augment common non-negatives\n",
+"# Generate common non-negatives\n",
 "\n",
-"Add triplets and name variants and nicknames to common non-negatives"
+"Add existing standard, triplets, and nicknames to common non-negatives"
 ]
 },
 {
@@ -28,6 +28,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"import re\n",
+"\n",
 "import pandas as pd\n",
 "from tqdm.auto import tqdm\n",
 "\n",
@@ -41,14 +43,16 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"given_surname = \"given\"\n",
+"given_surname = \"surname\"\n",
+"\n",
+"num_common_names = 10000\n",
 "\n",
-"common_non_negatives_path = f\"../references/common_{given_surname}_non_negatives.csv\"\n",
+"pref_path = f\"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz\"\n",
+"std_path = f\"../references/std_{given_surname}.txt\"\n",
 "triplets_path=f\"../data/processed/tree-hr-{given_surname}-triplets-v2-1000.csv.gz\"\n",
-"name_variants_path = f\"../references/{given_surname}_variants.csv\"\n",
 "given_nicknames_path = \"../references/givenname_nicknames.csv\"\n",
 "\n",
-"augmented_path = f\"../data/processed/common_{given_surname}_non_negatives-augmented.csv\""
+"non_negatives_path = f\"../data/processed/common_{given_surname}_non_negatives.csv\""
 ]
 },
 {
@@ -64,7 +68,7 @@
 "id": "401ad99c",
 "metadata": {},
 "source": [
-"### read common non-negatives"
+"### read preferred names"
 ]
 },
 {
@@ -74,23 +78,52 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"common_non_negatives_df = read_csv(common_non_negatives_path)\n",
-"print(len(common_non_negatives_df))\n",
-"common_non_negatives_df.head(3)"
+"pref_df = read_csv(pref_path)\n",
+"common_names = set([name for name in pref_df['name'][:num_common_names].tolist() \\\n",
+"                    if len(name) > 1 and re.fullmatch(r'[a-z]+', name)])\n",
+"len(common_names)"
+]
+},
+{
+"cell_type": "markdown",
+"id": "2be4909b",
+"metadata": {},
+"source": [
+"## Start with FS buckets"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"id": "ab78be66",
+"id": "6a359ed7",
 "metadata": {},
 "outputs": [],
 "source": [
+"common_names_set = set(common_names)\n",
 "common_non_negatives = set()\n",
-"for name1, name2 in common_non_negatives_df.values.tolist():\n",
-"    common_non_negatives.add((name1, name2))\n",
-"    common_non_negatives.add((name2, name1))\n",
-"len(common_non_negatives)"
+"\n",
+"with open(std_path) as f:\n",
+"    for ix, line in enumerate(f.readlines()):\n",
+"        line = line.strip()\n",
+"        head_names, tail_names = line.split(':')\n",
+"        head_names = head_names.strip()\n",
+"        tail_names = tail_names.strip()\n",
+"        names = set()\n",
+"        if len(head_names):\n",
+"            names |= set(head_names.split(' '))\n",
+"        if len(tail_names):\n",
+"            names |= set(tail_names.split(' '))\n",
+"        names = [name for name in names if len(name) > 0]\n",
+"        for name1 in names:\n",
+"            if name1 not in common_names_set:\n",
+"                continue\n",
+"            for name2 in names:\n",
+"                if name2 not in common_names_set:\n",
+"                    continue\n",
+"                if name1 == name2:\n",
+"                    continue\n",
+"                common_non_negatives.add((name1, name2))\n",
+"print(len(common_non_negatives))"
 ]
 },
 {
@@ -132,39 +165,6 @@
 "len(common_non_negatives)"
 ]
 },
-{
-"cell_type": "markdown",
-"id": "2a9e8224",
-"metadata": {},
-"source": [
-"### add name variants"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "7174dff9",
-"metadata": {},
-"outputs": [],
-"source": [
-"name_variants_df = read_csv(name_variants_path)\n",
-"print(len(name_variants_df))\n",
-"name_variants_df.head(3)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "fc2e63ac",
-"metadata": {},
-"outputs": [],
-"source": [
-"for name1, name2 in name_variants_df.values.tolist():\n",
-"    common_non_negatives.add((name1, name2))\n",
-"    common_non_negatives.add((name2, name1))\n",
-"len(common_non_negatives)"
-]
-},
 {
 "cell_type": "markdown",
 "id": "b7d9ce98",
@@ -197,7 +197,7 @@
 "id": "e0df5a95",
 "metadata": {},
 "source": [
-"## Save augmented non-negatives"
+"## Save common non-negatives"
 ]
 },
 {
@@ -211,66 +211,13 @@
 "for name1, name2 in common_non_negatives:\n",
 "    records.append({'name1': name1, 'name2': name2})\n",
 "df = pd.DataFrame(records)\n",
-"df.to_csv(augmented_path, index=False)"
-]
-},
-{
-"cell_type": "markdown",
-"id": "21fa63a6",
-"metadata": {},
-"source": [
-"## Miscellaneous\n",
-"\n",
-"Generate common non-negatives from existing standard"
-]
-},
-{
-"cell_type": "raw",
-"id": "9278c668",
-"metadata": {},
-"source": [
-"common_names_set = set(common_names)\n",
-"\n",
-"with open(f\"../references/std_{given_surname}.txt\") as f:\n",
-"    for ix, line in enumerate(f.readlines()):\n",
-"        line = line.strip()\n",
-"        head_names, tail_names = line.split(':')\n",
-"        head_names = head_names.strip()\n",
-"        tail_names = tail_names.strip()\n",
-"        names = set()\n",
-"        if len(head_names):\n",
-"            names |= set(head_names.split(' '))\n",
-"        if len(tail_names):\n",
-"            names |= set(tail_names.split(' '))\n",
-"        names = [name for name in names if len(name) > 0]\n",
-"        for i in range(0, len(names)):\n",
-"            if names[i] not in common_names_set:\n",
-"                continue\n",
-"            for j in range(i+1, len(names)):\n",
-"                if names[j] not in common_names_set:\n",
-"                    continue\n",
-"                name1 = names[i]\n",
-"                name2 = names[j]\n",
-"                if name1 > name2:\n",
-"                    name1, name2 = name2, name1\n",
-"                common_non_negatives.add(f\"{name1}:{name2}\")\n",
-"print(len(common_non_negatives))\n",
-"\n",
-"variants = []\n",
-"for name_pair in sorted(common_non_negatives):\n",
-"    name1, name2 = name_pair.split(':')\n",
-"    if name1 > name2:\n",
-"        print(\"ERROR\", name1, name2)\n",
-"    variants.append({\"name1\": name1, \"name2\": name2})\n",
-"print(len(variants))\n",
-"df = pd.DataFrame(variants)\n",
-"df.to_csv(common_non_negatives_path, index=False)"
+"df.to_csv(non_negatives_path, index=False)"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"id": "dd31da69",
+"id": "4a0ac472",
 "metadata": {},
 "outputs": [],
 "source": []

notebooks/206_analyze_triplets.ipynb

Lines changed: 19 additions & 5 deletions
@@ -45,14 +45,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"given_surname = \"given\"\n",
+"given_surname = \"surname\"\n",
 "sample_frac = 1.0\n",
-"num_common_names = 1000\n",
-"num_semi_common_names = 1500\n",
+"num_common_names = 1000 if given_surname == \"given\" else 2500\n",
+"num_semi_common_names = 1500 if given_surname == \"given\" else 4000\n",
 "\n",
 "pref_path = f\"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz\"\n",
 "triplets_path=f\"../data/processed/tree-hr-{given_surname}-triplets-v2-1000.csv.gz\"\n",
-"common_non_negatives_path = f\"../data/processed//common_{given_surname}_non_negatives-augmented.csv\""
+"common_non_negatives_path = f\"../data/processed/common_{given_surname}_non_negatives.csv\""
 ]
 },
 {
@@ -117,6 +117,17 @@
 "triplets_df[(triplets_df['anchor'] == 'zsuzsanna') | (triplets_df['positive'] == 'zsuzsanna')]"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "05ce6ee1",
+"metadata": {},
+"outputs": [],
+"source": [
+"name = 'quass'\n",
+"triplets_df[(triplets_df['anchor'] == name) | (triplets_df['positive'] == name)]"
+]
+},
 {
 "cell_type": "markdown",
 "id": "086d25ac",
@@ -312,7 +323,10 @@
 "id": "b09c7c6c",
 "metadata": {},
 "source": [
-"## Review semi-common non-negatives that aren't represented in anchor-pos pairs"
+"## Review semi-common non-negatives that aren't represented in anchor-pos pairs\n",
+"\n",
+"**TODO:** We should ask someone to review these pairs and take out the non-non-negatives (non-matches), \n",
+"and then somehow add the remaining matches when we augment the triplets in notebook 207."
 ]
 },
 {

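The cell added to notebook 206 filters the triplets DataFrame to rows where a chosen name appears as either the anchor or the positive. A hedged, standalone sketch of that boolean-mask pattern; the tiny DataFrame is made up, while the real one is loaded from `triplets_path`:

```python
import pandas as pd

# made-up triplets standing in for the real triplets DataFrame
triplets_df = pd.DataFrame({
    "anchor":   ["quass", "smith", "jones"],
    "positive": ["kwass", "smyth", "quass"],
    "negative": ["brown", "jones", "smith"],
})

name = "quass"
# keep rows where the name appears in either the anchor or positive column
mask = (triplets_df["anchor"] == name) | (triplets_df["positive"] == name)
print(triplets_df[mask])  # rows 0 and 2
```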