Upgrade workflows (openproblems-bio#55)

* refactor workflow * check for nans in python metrics
Paulos2411 · May 31, 2024 · 1dcfdba · 1dcfdba
1 parent a90f631
commit 1dcfdba
Show file tree

Hide file tree

Showing 10 changed files with 309 additions and 189 deletions.
diff --git a/src/task/metrics/mean_correlation/script.py b/src/task/metrics/mean_correlation/script.py
@@ -30,6 +30,15 @@
 de_test_X = de_test.layers[par["de_test_layer"]]
 prediction_X = prediction.layers[par["prediction_layer"]]
 
+# NaN guard: NaNs in de_test_X raise an error; NaNs in prediction_X are replaced with zeros
+if np.isnan(de_test_X).any():
+    raise ValueError("NaNs in de_test_X")
+if np.isnan(prediction_X).any():
+    # warn, then zero-fill NaNs (note: np.nan_to_num defaults also map +/-inf to large finite values)
+    print("NaNs in prediction_X, filling with zeros", flush=True)
+    prediction_X = np.nan_to_num(prediction_X)
+
+
 print("Calculate metrics", flush=True)
 mean_pearson = np.mean(
     [np.corrcoef(de_test_X[i,], prediction_X[i,])[0, 1] for i in range(de_test_X.shape[0])]

diff --git a/src/task/metrics/mean_cosine_sim/script.py b/src/task/metrics/mean_cosine_sim/script.py
@@ -30,6 +30,14 @@
 de_test_X = de_test.layers[par["de_test_layer"]]
 prediction_X = prediction.layers[par["prediction_layer"]]
 
+# NaN guard: NaNs in de_test_X raise an error; NaNs in prediction_X are replaced with zeros
+if np.isnan(de_test_X).any():
+    raise ValueError("NaNs in de_test_X")
+if np.isnan(prediction_X).any():
+    # warn, then zero-fill NaNs (note: np.nan_to_num defaults also map +/-inf to large finite values)
+    print("NaNs in prediction_X, filling with zeros", flush=True)
+    prediction_X = np.nan_to_num(prediction_X)
+
 print("Clipping values", flush=True)
 threshold_0001 = -np.log10(0.0001)
 de_test_X_clipped_0001 = np.clip(de_test_X, -threshold_0001, threshold_0001)

diff --git a/src/task/metrics/mean_rowwise_error/script.py b/src/task/metrics/mean_rowwise_error/script.py
@@ -30,6 +30,14 @@
 de_test_X = de_test.layers[par["de_test_layer"]]
 prediction_X = prediction.layers[par["prediction_layer"]]
 
+# NaN guard: NaNs in de_test_X raise an error; NaNs in prediction_X are replaced with zeros
+if np.isnan(de_test_X).any():
+    raise ValueError("NaNs in de_test_X")
+if np.isnan(prediction_X).any():
+    # warn, then zero-fill NaNs (note: np.nan_to_num defaults also map +/-inf to large finite values)
+    print("NaNs in prediction_X, filling with zeros", flush=True)
+    prediction_X = np.nan_to_num(prediction_X)
+
 print("Clipping values", flush=True)
 threshold_0001 = -np.log10(0.0001)
 de_test_X_clipped_0001 = np.clip(de_test_X, -threshold_0001, threshold_0001)

diff --git a/src/task/process_dataset/bootstrap/script.R b/src/task/process_dataset/bootstrap/script.R
@@ -40,7 +40,7 @@ for (i in seq_len(par$num_replicates)) {
   output_test_h5ad <- test_h5ad[, var_ix]
 
   original_dataset_id <- output_train_h5ad$uns[["dataset_id"]]
-  dataset_id <- paste0(original_dataset_id, "_bootstrap", i)
+  dataset_id <- paste0(original_dataset_id, "-bootstrap", i)
   output_train_h5ad$uns[["dataset_id"]] <- dataset_id
   output_test_h5ad$uns[["dataset_id"]] <- dataset_id
   output_train_h5ad$uns[["original_dataset_id"]] <- original_dataset_id

diff --git a/src/task/process_dataset/generate_id_map/config.vsh.yaml b/src/task/process_dataset/generate_id_map/config.vsh.yaml
@@ -0,0 +1,33 @@
+functionality:
+  name: generate_id_map
+  namespace: process_dataset
+  info:
+    type: process_dataset
+    type_info:
+      label: Generate ID map
+      summary: Generate the ID map file for competitors
+      description: |
+        This task generates the ID map file for competitors.
+  arguments:
+    - name: --de_test_h5ad
+      type: file
+      required: true
+      direction: input
+      example: resources/neurips-2023-data/de_test.h5ad
+    - name: --id_map
+      type: file
+      required: true
+      direction: output
+      example: resources/neurips-2023-data/id_map.csv
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages: [ anndata ]
+  - type: nextflow
+    directives:
+      label: [ midtime, midmem, lowcpu ]
diff --git a/src/task/process_dataset/generate_id_map/script.py b/src/task/process_dataset/generate_id_map/script.py
@@ -0,0 +1,19 @@
+import anndata as ad
+
+## VIASH START
+par = {
+    "de_test_h5ad": "resources/neurips-2023-data/de_test.h5ad",
+    "id_map": "resources/neurips-2023-data/id_map.csv",
+}
+## VIASH END
+
+print(">> Load dataset", flush=True)
+input_test = ad.read_h5ad(par["de_test_h5ad"])  # key must match the --de_test_h5ad argument
+
+print(">> Generate id_map file", flush=True)
+# Drop the obs index, then expose the fresh 0-based row number as an "id" column.
+obs_subset = input_test.obs[["sm_name", "cell_type"]]
+id_map = obs_subset.reset_index(drop=True).reset_index(names="id")
+
+print(">> Save data", flush=True)
+id_map.to_csv(par["id_map"], index=False)  # key must match the --id_map argument
diff --git a/src/task/workflows/process_dataset/config.vsh.yaml b/src/task/workflows/process_dataset/config.vsh.yaml
@@ -41,7 +41,7 @@ functionality:
     - name: process_dataset/filter_vars
     - name: process_dataset/add_uns_metadata
     - name: process_dataset/run_limma
-    - name: process_dataset/convert_h5ad_to_parquet
+    - name: process_dataset/generate_id_map
 platforms:
   - type: nextflow
     directives:

diff --git a/src/task/workflows/process_dataset/main.nf b/src/task/workflows/process_dataset/main.nf
@@ -58,22 +58,13 @@ workflow run_wf {
       toState: [de_test_h5ad: "output"]
     )
 
-    | convert_h5ad_to_parquet.run(
-      fromState: [
-        input_train: "de_train_h5ad",
-        input_test: "de_test_h5ad"
-      ],
-      toState: [
-        de_train: "output_train",
-        de_test: "output_test",
-        id_map: "output_id_map"
-      ]
+    | generate_id_map.run(
+      fromState: [de_test_h5ad: "de_test_h5ad"],
+      toState: [id_map: "id_map"]
     )
 
     | setState([
-      // "de_train",
       "de_train_h5ad",
-      // "de_test",
       "de_test_h5ad",
       "id_map"
     ])

diff --git a/src/task/workflows/run_benchmark/config.vsh.yaml b/src/task/workflows/run_benchmark/config.vsh.yaml
@@ -62,6 +62,13 @@ functionality:
           type: string
           multiple: true
           description: A list of method ids to run. If not specified, all methods will be run.
+        - name: "--metric_ids"
+          type: string
+          multiple: true
+          description: A list of metric ids to run. If not specified, all metrics will be run.
+    - name: Stability Analysis.
+      description: Run a stability analysis on the methods.
+      arguments:
         - name: --stability
           type: boolean
           description: Whether to run a stability analysis on the methods.
@@ -109,7 +116,7 @@ functionality:
     - name: metrics/mean_cosine_sim_r
     - name: metrics/mean_correlation_r
     - name: process_dataset/bootstrap
-    - name: process_dataset/convert_h5ad_to_parquet
+    - name: process_dataset/generate_id_map
   repositories:
     - name: openproblemsv2
       type: github