2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:

# Ruff for linting and formatting Python files
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.14.14
+ rev: v0.15.0
hooks:
- id: ruff-check
args: ["--fix"]
529 changes: 529 additions & 0 deletions notebooks/4.cpjump1-analysis/1.generate-on-off-signatures.ipynb

Large diffs are not rendered by default.

313 changes: 313 additions & 0 deletions notebooks/4.cpjump1-analysis/2.assess-heterogeneity.ipynb
@@ -0,0 +1,313 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9b390517",
"metadata": {},
"source": [
"# Assessing Heterogeneity\n",
"\n",
"In this notebook, we assess cellular heterogeneity using community-based clustering. We utilize the `optimized_clustering` function from the `heterogeneity` module, which leverages Optuna to perform hyperparameter optimization for clustering. The optimization process uses the silhouette score to identify the most suitable clustering parameters.\n",
"\n",
"Clustering is performed on a per-treatment basis, meaning that single cells are clustered separately for each treatment group. This approach allows us to capture the diversity within cell populations exposed to different treatments, rather than applying a global clustering across all cells."
]
},
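{
"cell_type": "markdown",
"id": "f3a9c0de",
"metadata": {},
"source": [
"To make the optimization concrete, below is a minimal sketch of what a single Optuna trial could look like. It assumes the clustering is done with `scanpy`'s neighbors/Leiden routines; the names and structure here are illustrative, not the actual `optimized_clustering` implementation:\n",
"\n",
"```python\n",
"import optuna\n",
"import scanpy as sc\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"\n",
"def objective(trial: optuna.Trial, adata) -> float:\n",
"    # adata: AnnData holding the single cells of one treatment group\n",
"    # sample hyperparameters from the search space defined below\n",
"    n_neighbors = trial.suggest_int(\"n_neighbors\", 5, 100)\n",
"    resolution = trial.suggest_float(\"cluster_resolution\", 0.1, 2.2)\n",
"    metric = trial.suggest_categorical(\n",
"        \"neighbor_distance_metric\", [\"euclidean\", \"cosine\", \"manhattan\"]\n",
"    )\n",
"\n",
"    # build the kNN graph and cluster it with Leiden\n",
"    sc.pp.neighbors(adata, n_neighbors=n_neighbors, metric=metric)\n",
"    sc.tl.leiden(adata, resolution=resolution, key_added=\"cluster\")\n",
"\n",
"    # the silhouette score is undefined for fewer than two clusters\n",
"    labels = adata.obs[\"cluster\"]\n",
"    if labels.nunique() < 2:\n",
"        return -1.0\n",
"    return silhouette_score(adata.X, labels)\n",
"\n",
"\n",
"# maximize the silhouette score over the trials; optimized_clustering\n",
"# repeats this search independently for each treatment group\n",
"study = optuna.create_study(direction=\"maximize\")\n",
"study.optimize(lambda trial: objective(trial, adata), n_trials=60)\n",
"```"
]
},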
{
"cell_type": "code",
"execution_count": null,
"id": "f932ad73",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import json\n",
"import logging\n",
"import pathlib\n",
"\n",
"import polars as pl\n",
"\n",
"sys.path.append(\"../../\")\n",
"from utils.io_utils import load_profiles\n",
"from utils.preprocess import apply_pca\n",
"from utils.data_utils import split_meta_and_features\n",
"from utils.heterogeneity import optimized_clustering"
]
},
{
"cell_type": "markdown",
"id": "52a776fb",
"metadata": {},
"source": [
"Setting parametes for the notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bbdaa864",
"metadata": {},
"outputs": [],
"source": [
"# parameters used for clustering optimization\n",
"cfret_screen_cluster_param_grid = {\n",
" # Clustering resolution: how granular the clusters should be\n",
" \"cluster_resolution\": {\"type\": \"float\", \"low\": 0.1, \"high\": 2.2},\n",
" # Number of neighbors for graph construction\n",
" \"n_neighbors\": {\"type\": \"int\", \"low\": 5, \"high\": 100},\n",
" # Clustering algorithm\n",
" \"cluster_method\": {\"type\": \"categorical\", \"choices\": [\"leiden\"]},\n",
" # Distance metric for neighbor computation\n",
" \"neighbor_distance_metric\": {\n",
" \"type\": \"categorical\",\n",
" \"choices\": [\"euclidean\", \"cosine\", \"manhattan\"],\n",
" },\n",
"}"
]
},
{
"cell_type": "markdown",
"id": "cbb31cd4",
"metadata": {},
"source": [
"Setting input and output paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bcd0e60",
"metadata": {},
"outputs": [],
"source": [
"# setting data directory\n",
"data_dir = pathlib.Path(\"../0.download-data/data/sc-profiles/\").resolve(strict=True)\n",
"\"../0.download-data/data/sc-profiles/cpjump1/cp\"\n",
"\n",
"# setting CPJUMP1 profiles path\n",
"cpjump1_profiles_path = (\n",
" data_dir / \"cpjump1/cpjump1_compound_concat_profiles.parquet\"\n",
").resolve(strict=True)\n",
"\n",
"# setting cpjump1 experimental data\n",
"cpjump1_experimental_data_path = (\n",
" data_dir / \"cpjump1/CPJUMP1-experimental-metadata.csv\"\n",
").resolve(strict=True)\n",
"\n",
"\n",
"# setting output paths\n",
"results_dir = pathlib.Path(\"./results\").resolve()\n",
"results_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# setting cluster output directory\n",
"cluster_results_dir = (results_dir / \"clusters\").resolve()\n",
"cluster_results_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# setting pca output results\n",
"pca_results_dir = (results_dir / \"pca\").resolve()\n",
"pca_results_dir.mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"id": "148ee40d",
"metadata": {},
"source": [
"Load the experimental metadata and select only the plates of interest. Here, we focus on plates incubated for 144 hours and filter by cell type. We then extract the plate barcodes corresponding to these conditions. This allows us to analyze only the relevant subset of the dataset for downstream analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94aff5d4",
"metadata": {},
"outputs": [],
"source": [
"# load in the experimental data\n",
"cpjump1_experimental_data = pl.read_csv(cpjump1_experimental_data_path)\n",
"\n",
"# Split the dataset by cell type and treatment duration\n",
"# Filter U2OS cells (all records)\n",
"cpjump1_u2os_exp_metadata = cpjump1_experimental_data.filter(\n",
" pl.col(\"Cell_type\") == \"U2OS\"\n",
")\n",
"\n",
"# Filter A549 cells with density of 100 for consistency\n",
"cpjump1_a549_exp_metadata = cpjump1_experimental_data.filter(\n",
" (pl.col(\"Cell_type\") == \"A549\") & (pl.col(\"Density\") == 100)\n",
")\n",
"\n",
"# get the plates for each cell type\n",
"u20s_plates = cpjump1_u2os_exp_metadata[\"Assay_Plate_Barcode\"].unique().to_list()\n",
"a549_plates = cpjump1_a549_exp_metadata[\"Assay_Plate_Barcode\"].unique().to_list()\n",
"\n",
"# print the plates\n",
"print(\"U2OS plates:\", u20s_plates)\n",
"print(\"A549 plates:\", a549_plates)"
]
},
{
"cell_type": "markdown",
"id": "52a1a7da",
"metadata": {},
"source": [
"Next we load in the compound cpjump1 single-cell profiles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "445fe34e",
"metadata": {},
"outputs": [],
"source": [
"# load profiles\n",
"cpjump1_df = load_profiles(cpjump1_profiles_path)\n",
"\n",
"# separete metadata and feature columns\n",
"cpjump1_meta, cpjump1_feats = split_meta_and_features(cpjump1_df)\n",
"\n",
"# display\n",
"print(\"shape: \", cpjump1_df.shape)\n",
"cpjump1_df.head()"
]
},
{
"cell_type": "markdown",
"id": "7e8581b8",
"metadata": {},
"source": [
"Convert the single-cell spce into PCA componenets that explains 95% of the variance "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18444dfd",
"metadata": {},
"outputs": [],
"source": [
"cpjump1_u2os_df = apply_pca(\n",
" cpjump1_df.filter(pl.col(\"Metadata_Plate\").is_in(u20s_plates)),\n",
" meta_features=cpjump1_meta,\n",
" morph_features=cpjump1_feats,\n",
" var_explained=0.95,\n",
")\n",
"cpjump1_a549_df = apply_pca(\n",
" cpjump1_df.filter(pl.col(\"Metadata_Plate\").is_in(a549_plates)),\n",
" meta_features=cpjump1_meta,\n",
" morph_features=cpjump1_feats,\n",
" var_explained=0.95,\n",
")\n",
"\n",
"# now get pca_feature names\n",
"cpjump1_a549_pca_features = cpjump1_a549_df.drop(cpjump1_meta).columns\n",
"cpjump1_u2os_pca_features = cpjump1_u2os_df.drop(cpjump1_meta).columns\n",
"\n",
"# save pca profiles\n",
"cpjump1_u2os_df.write_parquet(pca_results_dir / \"cpjump1_u2os_pca_profiles.parquet\")\n",
"cpjump1_a549_df.write_parquet(pca_results_dir / \"cpjump1_a549_pca_profiles.parquet\")\n",
"\n",
"# print shape of the pca dataframes\n",
"print(\"U2OS PCA shape: \", cpjump1_u2os_df.shape)\n",
"print(\"A549 PCA shape: \", cpjump1_a549_df.shape)"
]
},
{
"cell_type": "markdown",
"id": "64acde6f",
"metadata": {},
"source": [
"Execute optimized clustering for both U2OS cells and A549 cells "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2f48e4c",
"metadata": {},
"outputs": [],
"source": [
"# setting log (delete later)\n",
"logging.basicConfig(\n",
" level=logging.INFO,\n",
" filename=\"_heterogeneity_clustering.log\",\n",
" format=\"%(asctime)s - %(levelname)s - %(message)s\",\n",
")\n",
"\n",
"try:\n",
" # Your clustering code here\n",
" logging.info(\"Starting U2OS clustering...\")\n",
"\n",
" # U2OS clustering optimization\n",
" u2os_clusters, u2os_params_summary = optimized_clustering(\n",
" cpjump1_u2os_df,\n",
" meta_features=cpjump1_meta,\n",
" morph_features=cpjump1_u2os_pca_features,\n",
" treatment_col=\"Metadata_pert_iname\",\n",
" param_grid=cfret_screen_cluster_param_grid,\n",
" n_trials=60,\n",
" n_jobs=45,\n",
" seed=0,\n",
" study_name=\"cpjump1_u2os_clustering_optimization\",\n",
" )\n",
"\n",
" # save cluster labels and parameters summary\n",
" u2os_clusters.write_parquet(results_dir / \"cpjump1_u2os_clusters.parquet\")\n",
" with open(\n",
" cluster_results_dir / \"cpjump1_u2os_clustering_optimization_study_summary.json\",\n",
" \"w\",\n",
" ) as f:\n",
" json.dump(u2os_params_summary, f, indent=4)\n",
" logging.info(\"U2OS clustering complete!\")\n",
"\n",
" # A549 clustering optimization\n",
" logging.info(\"Starting A549 clustering...\")\n",
" a549_clusters, a549_params_summary = optimized_clustering(\n",
" cpjump1_a549_df,\n",
" meta_features=cpjump1_meta,\n",
" morph_features=cpjump1_a549_pca_features,\n",
" treatment_col=\"Metadata_pert_iname\",\n",
" param_grid=cfret_screen_cluster_param_grid,\n",
" n_trials=60,\n",
" n_jobs=45,\n",
" seed=0,\n",
" study_name=\"cpjump1_a549_clustering_optimization\",\n",
" )\n",
"\n",
" # save cluster labels and parameters summary\n",
" a549_clusters.write_parquet(results_dir / \"cpjump1_a549_clusters.parquet\")\n",
" with open(\n",
" cluster_results_dir / \"cpjump1_a549_clustering_optimization_study_summary.json\",\n",
" \"w\",\n",
" ) as f:\n",
" json.dump(a549_params_summary, f, indent=4)\n",
" logging.info(\"A549 clustering complete!\")\n",
"\n",
"except Exception as e:\n",
" logging.error(f\"Error during clustering: {e}\", exc_info=True)\n",
" raise"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "buscar",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}