Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
__pycache__/
# vscode files
.vscode
# autosklearn
3.ML_model/autosklearn.ipynb
Binary file modified 1.format_data/data/training_data.csv.gz
Binary file not shown.
143 changes: 87 additions & 56 deletions 1.format_data/format_training_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,30 @@
" return None\n",
"\n",
"\n",
"def get_cell_control(\n",
"    plate: str,\n",
"    well: str,\n",
"    idr_metadata: pd.DataFrame,\n",
") -> str:\n",
"    \"\"\"Look up the control type of a well in the IDR screen annotations\n",
"\n",
"    Args:\n",
"        plate (str): plate name, compared against the \"Plate\" column\n",
"        well (str): well number, compared numerically against the \"Well Number\" column\n",
"        idr_metadata (pd.DataFrame): IDR annotations with \"Plate\", \"Well Number\" and \"Control Type\" columns\n",
"\n",
"    Raises:\n",
"        IndexError: if no annotation row matches the given plate and well\n",
"\n",
"    Returns:\n",
"        str: \"positive\", \"negative\" or \"none\" (any other or missing control type)\n",
"    \"\"\"\n",
"    # filter by plate first so well numbers are only converted for the relevant rows\n",
"    plate_annotations = idr_metadata.loc[idr_metadata[\"Plate\"] == plate]\n",
"    cell_annotations = plate_annotations.loc[\n",
"        plate_annotations[\"Well Number\"].astype(int) == int(well)\n",
"    ]\n",
"    if cell_annotations.empty:\n",
"        raise IndexError(\n",
"            f\"No IDR annotation found for plate {plate}, well {well}\"\n",
"        )\n",
"\n",
"    # if several rows match, the first one is used\n",
"    control_type = cell_annotations.iloc[0][\"Control Type\"]\n",
"    control_labels = {\n",
"        \"positive control\": \"positive\",\n",
"        \"negative control\": \"negative\",\n",
"    }\n",
"    return control_labels.get(control_type, \"none\")\n",
"\n",
"\n",
"def complete_single_cell(\n",
" single_cell_data: pd.DataFrame,\n",
" trainingset_file_url: str,\n",
" segmentation_data_dir: str,\n",
" idr_metadata: pd.DataFrame,\n",
") -> pd.DataFrame:\n",
"    \"\"\"Add Mitocheck_Object_ID, Mitocheck_Phenotypic_Class and Control_Type fields to single cell data: the phenotypic class is found by matching the cell object ID to the class given in trainingset.dat and the control type is looked up in the IDR metadata\n",
"\n",
Expand Down Expand Up @@ -129,25 +149,25 @@
" \"Mitocheck_Object_ID\",\n",
" cell_segmentation_data[\"Mitocheck_Object_ID\"].item(),\n",
" )\n",
" # get class and append to single cell data\n",
" cell_phenotypic_class = get_cell_class(\n",
" single_cell_data, trainingset_file_url, plate, well, frame\n",
" )\n",
" if cell_phenotypic_class == None:\n",
" print(\"This cell was not found in trainingset.dat!\")\n",
" single_cell_data.insert(0, \"Mitocheck_Phenotypic_Class\", cell_phenotypic_class)\n",
"\n",
" cell_control_type = get_cell_control(plate, well, idr_metadata)\n",
" single_cell_data.insert(1, \"Control_Type\", cell_control_type)\n",
"\n",
" return single_cell_data\n",
"\n",
"\n",
"def format_training_data(\n",
" mitocheck_data_version_url: str, save_path: pathlib.Path, compression: str\n",
") -> pd.DataFrame:\n",
"def format_training_data(mitocheck_data_version_url: str) -> pd.DataFrame:\n",
"    \"\"\"Add Mitocheck_Object_ID, Mitocheck_Phenotypic_Class and Control_Type fields to each single cell and compile all the cells into a single training data dataframe\n",
"\n",
" Args:\n",
" mitocheck_data_version_url (str): url with path to desired version of raw mitocheck_data\n",
" save_path (pathlib.Path): path to save training data\n",
" compression (str): type of compression to use when saving dataframe\n",
"\n",
" Returns:\n",
"        pd.DataFrame: completed training data with Mitocheck_Object_ID, Mitocheck_Phenotypic_Class and Control_Type for each cell\n",
Expand All @@ -157,20 +177,23 @@
" )\n",
" segmentation_data_dir = f\"{mitocheck_data_version_url}/2.segment_nuclei/segmented/\"\n",
" preprocessed_features_url = f\"{mitocheck_data_version_url}/4.preprocess_features/data/normalized_training_data.csv.gz\"\n",
" idr_metadata_url = f\"{mitocheck_data_version_url}/3.extract_features/idr0013-screenA-annotation.csv.gz\"\n",
"\n",
" preprocessed_features = pd.read_csv(preprocessed_features_url, compression=\"gzip\")\n",
" print(\"Loaded preprocessed features!\")\n",
"\n",
" idr_metadata = pd.read_csv(idr_metadata_url, dtype=object, compression=\"gzip\")\n",
" print(\"Loaded idr metadata!\")\n",
"\n",
" training_data = []\n",
" for index, row in preprocessed_features.iterrows():\n",
" single_cell = row\n",
" completed_single_cell = complete_single_cell(\n",
" single_cell, trainingset_file_url, segmentation_data_dir\n",
" single_cell, trainingset_file_url, segmentation_data_dir, idr_metadata\n",
" )\n",
" training_data.append(completed_single_cell)\n",
"\n",
" training_data = pd.concat(training_data)\n",
" training_data.to_csv(save_path, compression=compression)\n",
" return training_data"
]
},
Expand All @@ -191,6 +214,7 @@
"output_type": "stream",
"text": [
"Loaded preprocessed features!\n",
"Loaded idr metadata!\n",
"Processed cell at: LT0043_48/166/48, location: (263, 20)\n",
"Processed cell at: LT0043_48/166/48, location: (240, 28)\n",
"Processed cell at: LT0043_48/166/48, location: (253, 36)\n",
Expand Down Expand Up @@ -4511,9 +4535,9 @@
"output_dir = pathlib.Path(\"data/\")\n",
"output_dir.mkdir(parents=True, exist_ok=True)\n",
"save_path = pathlib.Path(f\"{output_dir}/training_data.csv.gz\")\n",
"compression = \"gzip\"\n",
"\n",
"training_data = format_training_data(mitocheck_data_version_url, save_path, compression)"
"training_data = format_training_data(mitocheck_data_version_url)\n",
"training_data.to_csv(save_path, compression=\"gzip\")"
]
},
{
Expand All @@ -4525,7 +4549,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(4308, 1292)\n"
"(4308, 1293)\n"
]
},
{
Expand All @@ -4550,6 +4574,7 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Mitocheck_Phenotypic_Class</th>\n",
" <th>Control_Type</th>\n",
" <th>Mitocheck_Object_ID</th>\n",
" <th>Location_Center_X</th>\n",
" <th>Location_Center_Y</th>\n",
Expand All @@ -4558,7 +4583,6 @@
" <th>Metadata_Site</th>\n",
" <th>Metadata_Plate_Map_Name</th>\n",
" <th>Metadata_DNA</th>\n",
" <th>Metadata_Gene</th>\n",
" <th>...</th>\n",
" <th>efficientnet_1270</th>\n",
" <th>efficientnet_1271</th>\n",
Expand All @@ -4576,6 +4600,7 @@
" <tr>\n",
" <th>0</th>\n",
" <td>ADCCM</td>\n",
" <td>none</td>\n",
" <td>13.0</td>\n",
" <td>262.777778</td>\n",
" <td>20.126984</td>\n",
Expand All @@ -4584,7 +4609,6 @@
" <td>1</td>\n",
" <td>LT0043_48_166_48</td>\n",
" <td>LT0043_48/166/48/LT0043_48_166_48.tif</td>\n",
" <td>OGG1</td>\n",
" <td>...</td>\n",
" <td>0.207932</td>\n",
" <td>-0.736547</td>\n",
Expand All @@ -4600,6 +4624,7 @@
" <tr>\n",
" <th>1</th>\n",
" <td>ADCCM</td>\n",
" <td>none</td>\n",
" <td>13.0</td>\n",
" <td>239.517241</td>\n",
" <td>28.206897</td>\n",
Expand All @@ -4608,7 +4633,6 @@
" <td>1</td>\n",
" <td>LT0043_48_166_48</td>\n",
" <td>LT0043_48/166/48/LT0043_48_166_48.tif</td>\n",
" <td>OGG1</td>\n",
" <td>...</td>\n",
" <td>0.38972</td>\n",
" <td>-0.562691</td>\n",
Expand All @@ -4624,6 +4648,7 @@
" <tr>\n",
" <th>2</th>\n",
" <td>ADCCM</td>\n",
" <td>none</td>\n",
" <td>13.0</td>\n",
" <td>252.980392</td>\n",
" <td>35.862745</td>\n",
Expand All @@ -4632,7 +4657,6 @@
" <td>1</td>\n",
" <td>LT0043_48_166_48</td>\n",
" <td>LT0043_48/166/48/LT0043_48_166_48.tif</td>\n",
" <td>OGG1</td>\n",
" <td>...</td>\n",
" <td>-0.154282</td>\n",
" <td>-0.519065</td>\n",
Expand All @@ -4648,6 +4672,7 @@
" <tr>\n",
" <th>3</th>\n",
" <td>ADCCM</td>\n",
" <td>none</td>\n",
" <td>13.0</td>\n",
" <td>258.288462</td>\n",
" <td>46.038462</td>\n",
Expand All @@ -4656,7 +4681,6 @@
" <td>1</td>\n",
" <td>LT0043_48_166_48</td>\n",
" <td>LT0043_48/166/48/LT0043_48_166_48.tif</td>\n",
" <td>OGG1</td>\n",
" <td>...</td>\n",
" <td>-0.298543</td>\n",
" <td>-0.587031</td>\n",
Expand All @@ -4672,6 +4696,7 @@
" <tr>\n",
" <th>4</th>\n",
" <td>Shape3</td>\n",
" <td>none</td>\n",
" <td>10.0</td>\n",
" <td>1212.640449</td>\n",
" <td>21.314607</td>\n",
Expand All @@ -4680,7 +4705,6 @@
" <td>1</td>\n",
" <td>LT0043_48_166_55</td>\n",
" <td>LT0043_48/166/55/LT0043_48_166_55.tif</td>\n",
" <td>OGG1</td>\n",
" <td>...</td>\n",
" <td>1.764085</td>\n",
" <td>-0.364659</td>\n",
Expand All @@ -4695,53 +4719,60 @@
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 1292 columns</p>\n",
"<p>5 rows × 1293 columns</p>\n",
"</div>"
],
"text/plain": [
" Mitocheck_Phenotypic_Class Mitocheck_Object_ID Location_Center_X \\\n",
"0 ADCCM 13.0 262.777778 \n",
"1 ADCCM 13.0 239.517241 \n",
"2 ADCCM 13.0 252.980392 \n",
"3 ADCCM 13.0 258.288462 \n",
"4 Shape3 10.0 1212.640449 \n",
" Mitocheck_Phenotypic_Class Control_Type Mitocheck_Object_ID \\\n",
"0 ADCCM none 13.0 \n",
"1 ADCCM none 13.0 \n",
"2 ADCCM none 13.0 \n",
"3 ADCCM none 13.0 \n",
"4 Shape3 none 10.0 \n",
"\n",
" Location_Center_X Location_Center_Y Metadata_Plate Metadata_Well \\\n",
"0 262.777778 20.126984 LT0043_48 166_48 \n",
"1 239.517241 28.206897 LT0043_48 166_48 \n",
"2 252.980392 35.862745 LT0043_48 166_48 \n",
"3 258.288462 46.038462 LT0043_48 166_48 \n",
"4 1212.640449 21.314607 LT0043_48 166_55 \n",
"\n",
" Location_Center_Y Metadata_Plate Metadata_Well Metadata_Site \\\n",
"0 20.126984 LT0043_48 166_48 1 \n",
"1 28.206897 LT0043_48 166_48 1 \n",
"2 35.862745 LT0043_48 166_48 1 \n",
"3 46.038462 LT0043_48 166_48 1 \n",
"4 21.314607 LT0043_48 166_55 1 \n",
" Metadata_Site Metadata_Plate_Map_Name \\\n",
"0 1 LT0043_48_166_48 \n",
"1 1 LT0043_48_166_48 \n",
"2 1 LT0043_48_166_48 \n",
"3 1 LT0043_48_166_48 \n",
"4 1 LT0043_48_166_55 \n",
"\n",
" Metadata_Plate_Map_Name Metadata_DNA \\\n",
"0 LT0043_48_166_48 LT0043_48/166/48/LT0043_48_166_48.tif \n",
"1 LT0043_48_166_48 LT0043_48/166/48/LT0043_48_166_48.tif \n",
"2 LT0043_48_166_48 LT0043_48/166/48/LT0043_48_166_48.tif \n",
"3 LT0043_48_166_48 LT0043_48/166/48/LT0043_48_166_48.tif \n",
"4 LT0043_48_166_55 LT0043_48/166/55/LT0043_48_166_55.tif \n",
" Metadata_DNA ... efficientnet_1270 \\\n",
"0 LT0043_48/166/48/LT0043_48_166_48.tif ... 0.207932 \n",
"1 LT0043_48/166/48/LT0043_48_166_48.tif ... 0.38972 \n",
"2 LT0043_48/166/48/LT0043_48_166_48.tif ... -0.154282 \n",
"3 LT0043_48/166/48/LT0043_48_166_48.tif ... -0.298543 \n",
"4 LT0043_48/166/55/LT0043_48_166_55.tif ... 1.764085 \n",
"\n",
" Metadata_Gene ... efficientnet_1270 efficientnet_1271 efficientnet_1272 \\\n",
"0 OGG1 ... 0.207932 -0.736547 0.010863 \n",
"1 OGG1 ... 0.38972 -0.562691 -0.044208 \n",
"2 OGG1 ... -0.154282 -0.519065 0.584269 \n",
"3 OGG1 ... -0.298543 -0.587031 0.838506 \n",
"4 OGG1 ... 1.764085 -0.364659 -0.623983 \n",
" efficientnet_1271 efficientnet_1272 efficientnet_1273 efficientnet_1274 \\\n",
"0 -0.736547 0.010863 0.290715 -0.508518 \n",
"1 -0.562691 -0.044208 -0.159093 -0.605761 \n",
"2 -0.519065 0.584269 0.860831 -0.446671 \n",
"3 -0.587031 0.838506 1.16317 -0.083327 \n",
"4 -0.364659 -0.623983 0.087524 -0.678471 \n",
"\n",
" efficientnet_1273 efficientnet_1274 efficientnet_1275 efficientnet_1276 \\\n",
"0 0.290715 -0.508518 -0.666912 0.527043 \n",
"1 -0.159093 -0.605761 -0.605434 0.3765 \n",
"2 0.860831 -0.446671 -0.409693 0.383752 \n",
"3 1.16317 -0.083327 -0.20665 0.253444 \n",
"4 0.087524 -0.678471 -1.04743 0.1197 \n",
" efficientnet_1275 efficientnet_1276 efficientnet_1277 efficientnet_1278 \\\n",
"0 -0.666912 0.527043 -0.216474 0.659347 \n",
"1 -0.605434 0.3765 -0.496571 0.028506 \n",
"2 -0.409693 0.383752 -0.343047 -0.370232 \n",
"3 -0.20665 0.253444 -0.084782 0.073759 \n",
"4 -1.04743 0.1197 0.254014 0.080685 \n",
"\n",
" efficientnet_1277 efficientnet_1278 efficientnet_1279 \n",
"0 -0.216474 0.659347 -0.692728 \n",
"1 -0.496571 0.028506 -0.152331 \n",
"2 -0.343047 -0.370232 0.267983 \n",
"3 -0.084782 0.073759 -0.251357 \n",
"4 0.254014 0.080685 -0.808582 \n",
" efficientnet_1279 \n",
"0 -0.692728 \n",
"1 -0.152331 \n",
"2 0.267983 \n",
"3 -0.251357 \n",
"4 -0.808582 \n",
"\n",
"[5 rows x 1292 columns]"
"[5 rows x 1293 columns]"
]
},
"execution_count": 4,
Expand All @@ -4757,7 +4788,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.13 ('1.format_training_data')",
"display_name": "Python 3.8.13 ('2.ML_phenotypic_classification')",
"language": "python",
"name": "python3"
},
Expand All @@ -4776,7 +4807,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "ae034360bdddb129fa85f9eede9cbeb42bd6cc20eebc1ae813504c3c5ca537a5"
"hash": "4cc408a06ad49ae0c78cd765de22f61d31a0f8b0861ec15e52107dd82d811e52"
}
}
},
Expand Down
Loading