Skip to content

Commit

Permalink
Upgrade workflows (openproblems-bio#55)
Browse files Browse the repository at this point in the history
* refactor workflow

* check for nans in python metrics
  • Loading branch information
rcannood authored May 31, 2024
1 parent a90f631 commit 1dcfdba
Show file tree
Hide file tree
Showing 10 changed files with 309 additions and 189 deletions.
9 changes: 9 additions & 0 deletions src/task/metrics/mean_correlation/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@
de_test_X = de_test.layers[par["de_test_layer"]]
prediction_X = prediction.layers[par["prediction_layer"]]

# check nans
if np.isnan(de_test_X).any():
raise ValueError("NaNs in de_test_X")
if np.isnan(prediction_X).any():
# warn and fill with 0s
print("NaNs in prediction_X, filling with zeros", flush=True)
prediction_X = np.nan_to_num(prediction_X)


print("Calculate metrics", flush=True)
mean_pearson = np.mean(
[np.corrcoef(de_test_X[i,], prediction_X[i,])[0, 1] for i in range(de_test_X.shape[0])]
Expand Down
8 changes: 8 additions & 0 deletions src/task/metrics/mean_cosine_sim/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
de_test_X = de_test.layers[par["de_test_layer"]]
prediction_X = prediction.layers[par["prediction_layer"]]

# check nans
if np.isnan(de_test_X).any():
raise ValueError("NaNs in de_test_X")
if np.isnan(prediction_X).any():
# warn and fill with 0s
print("NaNs in prediction_X, filling with zeros", flush=True)
prediction_X = np.nan_to_num(prediction_X)

print("Clipping values", flush=True)
threshold_0001 = -np.log10(0.0001)
de_test_X_clipped_0001 = np.clip(de_test_X, -threshold_0001, threshold_0001)
Expand Down
8 changes: 8 additions & 0 deletions src/task/metrics/mean_rowwise_error/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
de_test_X = de_test.layers[par["de_test_layer"]]
prediction_X = prediction.layers[par["prediction_layer"]]

# check nans
if np.isnan(de_test_X).any():
raise ValueError("NaNs in de_test_X")
if np.isnan(prediction_X).any():
# warn and fill with 0s
print("NaNs in prediction_X, filling with zeros", flush=True)
prediction_X = np.nan_to_num(prediction_X)

print("Clipping values", flush=True)
threshold_0001 = -np.log10(0.0001)
de_test_X_clipped_0001 = np.clip(de_test_X, -threshold_0001, threshold_0001)
Expand Down
2 changes: 1 addition & 1 deletion src/task/process_dataset/bootstrap/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ for (i in seq_len(par$num_replicates)) {
output_test_h5ad <- test_h5ad[, var_ix]

original_dataset_id <- output_train_h5ad$uns[["dataset_id"]]
dataset_id <- paste0(original_dataset_id, "_bootstrap", i)
dataset_id <- paste0(original_dataset_id, "-bootstrap", i)
output_train_h5ad$uns[["dataset_id"]] <- dataset_id
output_test_h5ad$uns[["dataset_id"]] <- dataset_id
output_train_h5ad$uns[["original_dataset_id"]] <- original_dataset_id
Expand Down
33 changes: 33 additions & 0 deletions src/task/process_dataset/generate_id_map/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
functionality:
name: generate_id_map
namespace: process_dataset
info:
type: process_dataset
type_info:
label: Generate ID map
summary: Generate the ID map file for competitors
description: |
This task generates the ID map file for competitors.
arguments:
- name: --de_test_h5ad
type: file
required: true
direction: input
example: resources/neurips-2023-data/de_test.h5ad
- name: --id_map
type: file
required: true
direction: output
example: resources/neurips-2023-data/id_map.csv
resources:
- type: python_script
path: script.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
packages: [ anndata ]
- type: nextflow
directives:
label: [ midtime, midmem, lowcpu ]
19 changes: 19 additions & 0 deletions src/task/process_dataset/generate_id_map/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import anndata as ad

## VIASH START
# Default parameters used when the script is run directly.
# NOTE(review): presumably viash replaces this section at build time with the
# arguments declared in config.vsh.yaml (--de_test_h5ad, --id_map) -- confirm.
par = {
    "de_test_h5ad": "resources/neurips-2023-data/de_test.h5ad",
    "id_map": "resources/neurips-2023-data/id_map.csv",
}
## VIASH END

print(">> Load dataset", flush=True)
# The keys read here must be the ones defined in `par`; the previous
# "input_test" / "output_id_map" keys do not exist and raised a KeyError.
de_test = ad.read_h5ad(par["de_test_h5ad"])

print(">> Generate id_map file", flush=True)
# Keep only the two identifying columns, discard the original obs index and
# expose a fresh 0-based row number as the "id" column. Non-inplace calls are
# used so the column subset taken from `.obs` is never mutated in place.
id_map = (
    de_test.obs[["sm_name", "cell_type"]]
    .reset_index(drop=True)
    .reset_index(names="id")
)

print(">> Save data", flush=True)
id_map.to_csv(par["id_map"], index=False)
2 changes: 1 addition & 1 deletion src/task/workflows/process_dataset/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ functionality:
- name: process_dataset/filter_vars
- name: process_dataset/add_uns_metadata
- name: process_dataset/run_limma
- name: process_dataset/convert_h5ad_to_parquet
- name: process_dataset/generate_id_map
platforms:
- type: nextflow
directives:
Expand Down
15 changes: 3 additions & 12 deletions src/task/workflows/process_dataset/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -58,22 +58,13 @@ workflow run_wf {
toState: [de_test_h5ad: "output"]
)

| convert_h5ad_to_parquet.run(
fromState: [
input_train: "de_train_h5ad",
input_test: "de_test_h5ad"
],
toState: [
de_train: "output_train",
de_test: "output_test",
id_map: "output_id_map"
]
| generate_id_map.run(
fromState: [de_test_h5ad: "de_test_h5ad"],
toState: [id_map: "id_map"]
)

| setState([
// "de_train",
"de_train_h5ad",
// "de_test",
"de_test_h5ad",
"id_map"
])
Expand Down
9 changes: 8 additions & 1 deletion src/task/workflows/run_benchmark/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ functionality:
type: string
multiple: true
description: A list of method ids to run. If not specified, all methods will be run.
- name: "--metric_ids"
type: string
multiple: true
description: A list of metric ids to run. If not specified, all metrics will be run.
- name: Stability Analysis.
description: Run a stability analysis on the methods.
arguments:
- name: --stability
type: boolean
description: Whether to run a stability analysis on the methods.
Expand Down Expand Up @@ -109,7 +116,7 @@ functionality:
- name: metrics/mean_cosine_sim_r
- name: metrics/mean_correlation_r
- name: process_dataset/bootstrap
- name: process_dataset/convert_h5ad_to_parquet
- name: process_dataset/generate_id_map
repositories:
- name: openproblemsv2
type: github
Expand Down
Loading

0 comments on commit 1dcfdba

Please sign in to comment.