93 changes: 75 additions & 18 deletions data_wrangling/selection/annotation_dataset_selection.ipynb
@@ -1489,7 +1489,7 @@
"\n",
"- No. of entries in your DataFrame: 282\n",
"- No. of different MITI modified scores: 7, Global scores: [3.0, 3.3333332538604736, 3.6666667461395264, 4.0, 4.333333492279053, 4.666666507720947, 5.0]\n",
"- No. of different education backgrounds: 4, Education backgrounds: [3.0, 4.0, 5.0, 6.0]\n",
"- No. of different education backgrounds: 4, Education backgrounds: [4.0, 5.0, 6.0, 3.0]\n",
"- No. of different ages: 142, Ages range: 33.051334381103516... 74.56810760498047\n",
"- No. of different races: 3, Races: [2.0, 3.0, 5.0]\n"
]
@@ -1577,8 +1577,8 @@
"\n",
"- No. of entries in your DataFrame: 256\n",
"- No. of different MITI modified scores: 2, Global scores: [4.0, 4.333333492279053]\n",
"- No. of different education backgrounds: 4, Education backgrounds: [3.0, 4.0, 5.0, 6.0]\n",
"- No. of different ages: 133, Ages range: 33.051334381103516... 148.9007568359375\n",
"- No. of different education backgrounds: 4, Education backgrounds: [4.0, 5.0, 6.0, 3.0]\n",
"- No. of different ages: 133, Ages range: 33.051334381103516... 74.56810760498047\n",
"- No. of different races: 3, Races: [2.0, 3.0, 5.0]\n",
"\n",
"\n",
@@ -2086,7 +2086,7 @@
"- No. of different ages: 14, Ages range: 33.15263366699219... 71.16221618652344\n",
"- No. of different races: 4, Races: [3.0, 4.0, 5.0, 6.0]\n",
"- >>> There are 3 participant IDs repeated in your sample: ***\n",
" {'838-0225-001', '141-0225-009', '854-0225-005'}\n"
" {'141-0225-009', '838-0225-001', '854-0225-005'}\n"
]
}
],
@@ -2203,7 +2203,7 @@
"- No. of different ages: 32, Ages range: 33.15263366699219... 82.46954345703125\n",
"- No. of different races: 4, Races: [3.0, 4.0, 5.0, 6.0]\n",
"- >>> There are 3 participant IDs repeated in your sample: ***\n",
" {'838-0225-001', '141-0225-009', '854-0225-005'}\n"
" {'141-0225-009', '838-0225-001', '854-0225-005'}\n"
]
}
],
@@ -2323,10 +2323,9 @@
"source": [
"## VI. Sampling the Spanish dataset by global rating\n",
"\n",
"Total sampled English calls should be (n=30) plus the ones we already sampled from the pilot (3):\n",
"- We want at least 5 to be from the same participant for a case study (Alexandra still needs to tell me who this is)\n",
"- 35 interviews x 2 annotators | 1 week x annotator = 18 weeks: Nov - May (taking into account holidays etc)\n",
"- Sample the first half with non-Latinas' interviews and second half with English-speaking Latinas' interviews"
"Total sampled Spanish calls should be similar to the English sample (n=36)\n",
"- We already sampled 3 calls from the pilot\n",
"- We want at least 5 to be from the same participant(s) for a case study (TODO later)"
]
},
{
@@ -2504,7 +2503,7 @@
"- No. of different ages: 10, Ages range: 41.040382385253906... 68.49007415771484\n",
"- No. of different races: 3, Races: [0.0, 5.0, 6.0]\n",
"- >>> There are 10 participant IDs repeated in your sample: ***\n",
" {'259-0225-016', '022-0225-015', '011-0225-006', '177-0225-002', '037-0225-003', '177-0225-001', '177-0225-007', '078-0225-004', '141-0225-007', '817-0225-002'}\n"
" {'177-0225-002', '177-0225-007', '817-0225-002', '078-0225-004', '177-0225-001', '037-0225-003', '011-0225-006', '141-0225-007', '022-0225-015', '259-0225-016'}\n"
]
}
],
@@ -2592,7 +2591,7 @@
"- No. of different ages: 10, Ages range: 41.040382385253906... 68.49007415771484\n",
"- No. of different races: 3, Races: [0.0, 5.0, 6.0]\n",
"- >>> There are 10 participant IDs repeated in your sample: ***\n",
" {'259-0225-016', '022-0225-015', '177-0225-002', '011-0225-006', '037-0225-003', '177-0225-001', '177-0225-007', '078-0225-004', '141-0225-007', '817-0225-002'}\n"
" {'177-0225-002', '177-0225-007', '817-0225-002', '078-0225-004', '177-0225-001', '037-0225-003', '011-0225-006', '141-0225-007', '022-0225-015', '259-0225-016'}\n"
]
}
],
@@ -2707,15 +2706,74 @@
},
{
"cell_type": "markdown",
"id": "d65c7950",
"id": "04849aa4",
"metadata": {},
"source": [
"## VI. Join English and Spanish samples"
"### VI. II. Random sample from available interviews provided by Hagan\n",
"\n",
"- Calls live securely here: https://arizona.box.com/s/nkoc8hi0hqltsx6dtecimqumpztqdo4m\n",
"- Save the file listing to `./data` with `unzip -l Latina_calls.zip > latina_calls.txt`\n",
"- We'll download the zipped folder, build a list of wav files per participant, and randomly pick from those\n",
"- There are three participants for whom I don't have interviews yet: `177-0225-007`, `817-0225-002`, and `259-0225-016`"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "948a171d",
"metadata": {},
"outputs": [],
"source": [
"# Get interview file names and participant IDs for later random selection\n",
"es_interview_dict = defaultdict(list)\n",
"\n",
"with open('./data/latina_calls.txt') as lc_f:\n",
" for line in lc_f:\n",
" if line.endswith('wav\\n'):\n",
" line = line.rstrip()\n",
" prefix, sid, interview_ID = line.split('/')[-3:]\n",
" if prefix == 'Full':\n",
" es_interview_dict[sid].append(interview_ID.removesuffix('.wav'))"
]
},
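The parsing in the cell above can be sketched in isolation. The listing lines below are hypothetical stand-ins for `unzip -l` output, assuming the archive layout `…/Full/<sid>/<interview_ID>.wav`:

```python
from collections import defaultdict

# Hypothetical `unzip -l` listing lines (sizes, dates, and IDs are made up).
sample_lines = [
    "  1234  2024-01-01 10:00   Latina_calls/Full/011-0225-006/INT001.wav\n",
    "  1234  2024-01-01 10:00   Latina_calls/Partial/011-0225-006/INT002.wav\n",
    "  1234  2024-01-01 10:00   Latina_calls/Full/037-0225-003/INT003.wav\n",
]

es_interview_dict = defaultdict(list)
for line in sample_lines:
    if line.endswith("wav\n"):
        # Last three path components: folder prefix, participant ID, file name
        prefix, sid, interview_id = line.rstrip().split("/")[-3:]
        if prefix == "Full":  # keep only full-length recordings
            es_interview_dict[sid].append(interview_id.removesuffix(".wav"))

print(dict(es_interview_dict))
# {'011-0225-006': ['INT001'], '037-0225-003': ['INT003']}
```

Note `str.removesuffix` requires Python 3.9+; on older interpreters, slice off the last four characters instead.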
{
"cell_type": "code",
"execution_count": 52,
"id": "7b23a083",
"metadata": {},
"outputs": [],
"source": [
"# Delete these once we have the data from Hagan\n",
"skip_ids = ['177-0225-007', '817-0225-002', '259-0225-016']\n",
"selected_interviews = []\n",
"\n",
"for i, r in spanish_interview_sample_for_annotation.iterrows():\n",
"    i_iID = r['interview_ID']\n",
"    isid = r['sid']\n",
"    if i_iID.startswith('RE') or isid in skip_ids:\n",
"        continue\n",
"    # Redraw until we pick an interview not already selected\n",
"    rand_interview = np.random.choice(es_interview_dict[isid])\n",
"    while rand_interview in selected_interviews:\n",
"        rand_interview = np.random.choice(es_interview_dict[isid])\n",
"    selected_interviews.append(rand_interview)\n",
"    spanish_interview_sample_for_annotation.replace(i_iID, rand_interview, inplace=True)\n",
"\n",
"assert (len(spanish_interview_sample_for_annotation['interview_ID'])\n",
" == len(spanish_interview_sample_for_annotation['interview_ID'].unique()))"
]
},
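The duplicate-avoidance redraw in the cell above can be sketched on its own with a seeded generator (participant IDs and interview names below are hypothetical):

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical candidate interviews per participant; note the shared "C".
candidates = {"011-0225-006": ["A", "B", "C"], "037-0225-003": ["C", "D"]}

selected = set()
picks = {}
for sid, options in candidates.items():
    # Redraw until we get an interview not already selected,
    # mirroring the while-loop in the cell above.
    pick = rng.choice(options)
    while pick in selected:
        pick = rng.choice(options)
    selected.add(pick)
    picks[sid] = str(pick)

assert len(set(picks.values())) == len(picks)
```

One caveat of the redraw approach: if every interview for a participant has already been selected, the loop never terminates; drawing from the set difference `set(options) - selected` avoids that edge case.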
{
"cell_type": "markdown",
"id": "d65c7950",
"metadata": {},
"source": [
"## VII. Join English and Spanish samples"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "e9d50db2",
"metadata": {},
"outputs": [
@@ -2742,14 +2800,13 @@
"- No. of different ages: 42, Ages range: 33.15263366699219... 82.46954345703125\n",
"- No. of different races: 5, Races: [0.0, 3.0, 4.0, 5.0, 6.0]\n",
"- >>> There are 13 participant IDs repeated in your sample: ***\n",
" {'838-0225-001', '259-0225-016', '022-0225-015', '177-0225-002', '011-0225-006', '854-0225-005', '037-0225-003', '177-0225-001', '177-0225-007', '078-0225-004', '141-0225-009', '141-0225-007', '817-0225-002'}\n"
" {'177-0225-002', '177-0225-007', '838-0225-001', '817-0225-002', '141-0225-009', '078-0225-004', '177-0225-001', '037-0225-003', '011-0225-006', '854-0225-005', '141-0225-007', '022-0225-015', '259-0225-016'}\n"
]
}
],
"source": [
"spanish_interview_sample_for_annotation = spanish_interview_sample_for_annotation.assign(language='Spanish')\n",
"english_interview_sample_for_annotation = english_interview_sample_for_annotation.assign(language='English')\n",
"# english_interview_sample_for_annotation\n",
"# df.insert(0, 'Language', 'Spanish')\n",
"# df.insert(0, 'Language', 'English')\n",
"full_interview_sample_for_annotation = pd.concat(\\\n",
@@ -2763,7 +2820,7 @@
},
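The join step above hinges on `assign` returning a new DataFrame rather than mutating in place, so its result must be re-bound before `pd.concat`. A minimal sketch with hypothetical mini-frames:

```python
import pandas as pd

# Hypothetical mini-frames standing in for the two samples.
es = pd.DataFrame({"interview_ID": ["E1", "E2"]})
en = pd.DataFrame({"interview_ID": ["A1"]})

# assign() returns a copy; calling it without re-binding is a no-op.
es = es.assign(language="Spanish")
en = en.assign(language="English")

# Stack the labeled samples and renumber the index.
full = pd.concat([es, en], ignore_index=True)
print(full.shape)  # (3, 2)
```

With the `language` column attached this way, the combined sample can later be split back by language without relying on ID formats.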
{
"cell_type": "code",
"execution_count": 52,
"execution_count": 54,
"id": "858235ef",
"metadata": {},
"outputs": [
@@ -2823,7 +2880,7 @@
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": 55,
"id": "445d0c01",
"metadata": {},
"outputs": [],
Expand All @@ -2841,7 +2898,7 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 56,
"id": "60f1b495",
"metadata": {},
"outputs": [],