Update_scripts (openproblems-bio#52)
* update

* fix labels

* add back epoch logging

* add message

* revert multigpu changes in LGC_ensemble

* simplify tower config

* fix scripts

* use mean_across_compounds as test data

* implement alternative implementations of the metrics in R

* fix descriptions, add alternative metrics in wf

* apply patch to fix lgc_ensemble on bootstrapped data

Co-authored-by: ttunja <60556758+ttunja@users.noreply.github.com>

* fix component names

* fix metric ids

* fill std nas in transformer_ensemble

* fix resource

* fix method id

---------

Co-authored-by: ttunja <60556758+ttunja@users.noreply.github.com>
rcannood and ttunja authored May 30, 2024
1 parent 1f6afe2 commit 22fc360
Showing 29 changed files with 454 additions and 110 deletions.
8 changes: 1 addition & 7 deletions scripts/add_a_method.sh
@@ -26,12 +26,6 @@ viash test src/task/methods/$method_id/config.vsh.yaml
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  ---setup cachedbuild ---verbose

- # run the method (using parquet as input)
- viash run src/task/methods/$method_id/config.vsh.yaml -- \
-   --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
-   --id_map "resources/neurips-2023-kaggle/id_map.csv" \
-   --output "output/prediction.h5ad"
-
# run the method (using h5ad as input)
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  --de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \
@@ -40,7 +34,7 @@ viash run src/task/methods/$method_id/config.vsh.yaml -- \

# run evaluation metric
viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \
-   --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
+   --de_test_h5ad "resources/neurips-2023-kaggle/de_test.h5ad" \
  --prediction "output/prediction.h5ad" \
  --output "output/score.h5ad"
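For orientation, mean_rowwise_error scores a prediction against the held-out DE matrix row by row. A minimal sketch of that computation, assuming dense matrices; "sign_log10_pval" is a hypothetical layer name for the test values, while "prediction" matches the layer written by the predict component further down this diff:

```python
import anndata as ad
import numpy as np

# Assumed inputs; layer names for the test data are illustrative.
de_test = ad.read_h5ad("resources/neurips-2023-kaggle/de_test.h5ad")
prediction = ad.read_h5ad("output/prediction.h5ad")

y_true = np.asarray(de_test.layers["sign_log10_pval"])  # hypothetical layer name
y_pred = np.asarray(prediction.layers["prediction"])

# RMSE per row (perturbation), then the mean across rows.
rmse_per_row = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=1))
print("mean_rowwise_rmse:", float(np.mean(rmse_per_row)))
```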
2 changes: 1 addition & 1 deletion scripts/generate_resources.sh
@@ -32,7 +32,7 @@ nextflow run \
  --publish_dir "$OUT"

echo ">> Run method"
- viash run src/task/control_methods/sample/config.vsh.yaml -- \
+ viash run src/task/control_methods/mean_across_compounds/config.vsh.yaml -- \
  --de_train_h5ad "$OUT/de_train.h5ad" \
  --de_test_h5ad "$OUT/de_test.h5ad" \
  --id_map "$OUT/id_map.csv" \
21 changes: 0 additions & 21 deletions scripts/run_benchmark.sh

This file was deleted.

23 changes: 0 additions & 23 deletions scripts/run_stability_test.sh

This file was deleted.

@@ -5,15 +5,15 @@ publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/result

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
- input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/**/state.yaml
+ input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
output_state: "state.yaml"
publish_dir: "$publish_dir"
- rename_keys: "de_train:de_train,de_train_h5ad:de_train_h5ad,de_test:de_test,de_test_h5ad:de_test_h5ad,id_map:id_map"
+ rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
settings: '{"stability": true, "stability_obs_fraction": 0.99, "stability_var_fraction": 0.99}'
HERE

tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
- --revision add_bootstrapping_build \
+ --revision main_build \
  --pull-latest \
  --main-script target/nextflow/workflows/run_benchmark/main.nf \
  --workspace 53907369739130 \
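The stability settings rerun the benchmark on slightly perturbed inputs. A rough sketch of what subsampling 99% of observations and variables could look like — this is a guess at the mechanism for illustration, not the workflow's actual code:

```python
import anndata as ad
import numpy as np

rng = np.random.default_rng(0)

def subsample(adata: ad.AnnData, obs_fraction: float = 0.99, var_fraction: float = 0.99) -> ad.AnnData:
    """Keep a random fraction of rows (perturbations) and columns (genes)."""
    obs_idx = rng.choice(adata.n_obs, size=int(adata.n_obs * obs_fraction), replace=False)
    var_idx = rng.choice(adata.n_vars, size=int(adata.n_vars * var_fraction), replace=False)
    return adata[obs_idx, var_idx].copy()

# e.g.: de_train_sub = subsample(ad.read_h5ad("de_train.h5ad"))
```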
23 changes: 23 additions & 0 deletions scripts/run_tval_tw.sh
@@ -0,0 +1,23 @@
#!/bin/bash

RUN_ID="tval_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
output_state: "state.yaml"
publish_dir: "$publish_dir"
rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
settings: '{"layer": "t"}'
HERE

tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
--revision add_cell_obs_to_uns_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--entry-name auto \
--config src/common/nextflow_helpers/labels_tw.config
5 changes: 5 additions & 0 deletions scripts/sync_results.sh
@@ -4,3 +4,8 @@ aws s3 sync \
  s3://openproblems-data/resources/dge_perturbation_prediction/results/ \
  output/benchmark_results/ \
  --delete --dryrun

+ aws s3 sync \
+   output/benchmark_results/ \
+   s3://openproblems-data/resources/dge_perturbation_prediction/results/ \
+   --delete --dryrun
15 changes: 5 additions & 10 deletions src/common/nextflow_helpers/labels_tw.config
@@ -30,33 +30,28 @@ process {
    disk = { 400.GB * task.attempt }
  }
  withLabel: lowsharedmem {
-   containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""}
+   containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" }
  }
  withLabel: midsharedmem {
-   containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""}
+   containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" }
  }
  withLabel: highsharedmem {
-   containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""}
+   containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" }
  }
  withLabel: gpu {
-   cpus = 16
+   memory = 100.GB
+   cpus = 20
    accelerator = 1
-   containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-     ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: midgpu { // aiming for g4dn.12xlarge
    memory = 150.GB
    cpus = 40
    accelerator = 4
-   containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-     ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: highgpu { // aiming for g4dn.metal
    memory = 300.GB
    cpus = 80
    accelerator = 8
-   containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-     ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  // make sure publishstates gets enough disk space and memory
  withName:'.*publishStatesProc' {
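As a worked example of the shared-memory arithmetic (assuming task.memory.mega yields the task's memory in megabytes), a quick sketch of the values the three labels produce for a 100 GB task:

```python
def shm_size(memory_mb: float, fraction: float) -> str:
    # Mirrors String.format("%.0f", task.memory.mega * fraction) in the config above
    return f"--shm-size {memory_mb * fraction:.0f}"

for label, fraction in [("lowsharedmem", 0.05), ("midsharedmem", 0.1), ("highsharedmem", 0.25)]:
    print(label, shm_size(100 * 1024, fraction))
# lowsharedmem --shm-size 5120, midsharedmem --shm-size 10240, highsharedmem --shm-size 25600
```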
19 changes: 6 additions & 13 deletions src/task/methods/lgc_ensemble_helpers/helper_functions.py
@@ -210,8 +210,8 @@ def train_function(model, model_name, x_train, y_train, x_val, y_val, info_data,
        if val_mrrmse < best_loss:
            best_loss = val_mrrmse
            best_weights = model.state_dict()
-           # print('BEST ----> ')
-           # print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}")
+           print('BEST ----> ')
+           print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}")
    t1 = time.time()
    results['runtime'] = float(t1-t0)
    model.load_state_dict(best_weights)
@@ -230,19 +230,12 @@ def cross_validate_models(X, y, kf_cv, cell_types_sm_names, paths, config=None,
                     'val_sm_name': cell_types_sm_names.iloc[val_idx]['sm_name'].tolist()}
        for Model in [LSTM, Conv, GRU]:
            model = Model(scheme, X.shape, y.shape)
-
-           if torch.cuda.device_count() > 1:
-               model = nn.DataParallel(model)
-               model_name = model.module.name
-           else:
-               model_name = model.name
-
-           model, results = train_function(model, model_name, x_train, y_train, x_val, y_val, info_data, config=config, clip_norm=clip_norm)
+           model, results = train_function(model, model.name, x_train, y_train, x_val, y_val, info_data, config=config, clip_norm=clip_norm)
            model.to('cpu')
            trained_models.append(model)
-           print(f'PATH OF THE MODEL EQUALS: {paths["model_dir"]}/pytorch_{model_name}_{scheme}_fold{i}.pt')
-           torch.save(model.state_dict(), f'{paths["model_dir"]}/pytorch_{model_name}_{scheme}_fold{i}.pt')
-           with open(f'{paths["logs_dir"]}/{model_name}_{scheme}_fold{i}.json', 'w') as file:
+           print(f'PATH OF THE MODEL EQUALS: {paths["model_dir"]}/pytorch_{model.name}_{scheme}_fold{i}.pt')
+           torch.save(model.state_dict(), f'{paths["model_dir"]}/pytorch_{model.name}_{scheme}_fold{i}.pt')
+           with open(f'{paths["logs_dir"]}/{model.name}_{scheme}_fold{i}.json', 'w') as file:
                json.dump(results, file)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
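The removed branch existed because nn.DataParallel wraps the model, so custom attributes such as name move behind .module; with the revert to single-GPU training, model.name is always valid. A minimal sketch of the wrapping behaviour (the Net class is a hypothetical stand-in for the LSTM/Conv/GRU models):

```python
import torch
import torch.nn as nn

class Net(nn.Module):  # hypothetical stand-in for the LSTM/Conv/GRU models
    def __init__(self):
        super().__init__()
        self.name = "Net"
        self.fc = nn.Linear(4, 2)

model = Net()
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    model_name = model.module.name  # the wrapper itself has no .name attribute
else:
    model_name = model.name
print(model_name)
```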
3 changes: 2 additions & 1 deletion src/task/methods/lgc_ensemble_helpers/prepare_data.py
@@ -19,6 +19,7 @@ def prepare_data(par, paths):
    mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
    std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
    std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()
+   std_sm_name_filled = std_sm_name.fillna(0)
    cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line
    quantiles_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\
        .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1)
@@ -30,7 +31,7 @@
    mean_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv', index=False)
    std_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/std_cell_type.csv', index=False)
    mean_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/mean_sm_name.csv', index=False)
-   std_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False)
+   std_sm_name_filled.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False)
    quantiles_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False)
    ## Create one hot encoding features
    one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"])
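The fillna(0) lines trace back to pandas behaviour: groupby().std() uses the sample standard deviation, which is NaN for any group with a single member — exactly what happens when a compound survives only once in bootstrapped training data. A small demonstration with toy data:

```python
import pandas as pd

df = pd.DataFrame({
    "sm_name": ["drug_a", "drug_a", "drug_b"],  # drug_b appears only once
    "gene_1": [0.5, 0.7, 0.3],
})

std = df.groupby("sm_name").std().reset_index()
print(std)            # drug_b's std is NaN (sample std needs >= 2 observations)
print(std.fillna(0))  # what gets written to std_sm_name.csv after the fix
```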
3 changes: 2 additions & 1 deletion src/task/methods/lgc_ensemble_predict/script.py
@@ -145,13 +145,14 @@
df_sub.reset_index(drop=True, inplace=True)

# write output
+ method_id = meta["functionality_name"].replace("_predict", "")
output = ad.AnnData(
    layers={"prediction": df_sub.to_numpy()},
    obs=pd.DataFrame(index=id_map["id"]),
    var=pd.DataFrame(index=gene_names),
    uns={
        "dataset_id": train_config["DATASET_ID"],
-       "method_id": meta["functionality_name"]
+       "method_id": method_id
    }
)
print(output)
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble_prepare/config.vsh.yaml
@@ -79,4 +79,4 @@ platforms:
  - type: native
  - type: nextflow
    directives:
-     label: [hightime, veryhighmem, highcpu, highsharedmem, gpu]
+     label: [hightime, veryhighmem, highcpu]
3 changes: 2 additions & 1 deletion src/task/methods/lgc_ensemble_prepare/script.py
@@ -60,6 +60,7 @@
mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()
+ std_sm_name_filled = std_sm_name.fillna(0)
cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line
quantiles_cell_type = pd.concat(
    [pd.DataFrame(cell_types)] +
@@ -74,7 +75,7 @@
mean_cell_type.to_csv(f'{par["train_data_aug_dir"]}/mean_cell_type.csv', index=False)
std_cell_type.to_csv(f'{par["train_data_aug_dir"]}/std_cell_type.csv', index=False)
mean_sm_name.to_csv(f'{par["train_data_aug_dir"]}/mean_sm_name.csv', index=False)
- std_sm_name.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False)
+ std_sm_name_filled.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False)
quantiles_cell_type.to_csv(f'{par["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False)
with open(f'{par["train_data_aug_dir"]}/gene_names.json', 'w') as f:
    json.dump(gene_names, f)
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble_train/config.vsh.yaml
@@ -69,4 +69,4 @@ platforms:
  - type: native
  - type: nextflow
    directives:
-     label: [hightime, veryhighmem, highcpu]
+     label: [hightime, veryhighmem, highcpu, highsharedmem, gpu]
4 changes: 4 additions & 0 deletions src/task/methods/lgc_ensemble_train/script.py
@@ -3,6 +3,10 @@
import json
import numpy as np
import pandas as pd
+ if torch.cuda.is_available():
+     print("using device: cuda", flush=True)
+ else:
+     print('using device: cpu', flush=True)

## VIASH START
par = {
4 changes: 2 additions & 2 deletions src/task/methods/transformer_ensemble/utils.py
@@ -47,10 +47,10 @@ def prepare_augmented_data(
    de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]

    mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
-   std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
+   std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0)

    mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
-   std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()
+   std_sm_name = de_sm_name.groupby('sm_name').std().reset_index().fillna(0)

    # Append mean and std for 'cell_type'
    rows = []
19 changes: 14 additions & 5 deletions src/task/metrics/mean_correlation/config.vsh.yaml
@@ -7,7 +7,13 @@ functionality:
        label: Mean Pearson
        summary: The mean of Pearson correlations per row (perturbation).
        description: |
-         We use the **Mean Pearson Correlation** to score submissions.
+         The **Mean Pearson Correlation** is computed as follows:
+
+         $$
+         \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{y}}_i)}}
+         $$
+
+         where $R$ is the number of scored rows, and $\mathbf{y}_i$ and $\mathbf{\hat{y}}_i$ are the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
@@ -17,7 +23,13 @@ functionality:
        label: Mean Spearman
        summary: The mean of Spearman correlations per row (perturbation).
        description: |
-         We use the **Mean Spearman Correlation** to score submissions.
+         The **Mean Spearman Correlation** is computed as follows:
+
+         $$
+         \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{r}}_i)}}
+         $$
+
+         where $R$ is the number of scored rows, and $\mathbf{r}_i$ and $\mathbf{\hat{r}}_i$ are the ranks of the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
@@ -29,9 +41,6 @@ functionality:
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
-   setup:
-     - type: python
-       packages: [ fastparquet ]
  - type: nextflow
    directives:
      label: [ midtime, highmem, highcpu ]
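A minimal sketch of both metrics in Python (using scipy for illustration; the component's actual script may differ):

```python
import numpy as np
from scipy.stats import pearsonr, spearmanr

def mean_rowwise_correlations(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Correlate each row (perturbation) separately, then average."""
    pearsons = [pearsonr(t, p)[0] for t, p in zip(y_true, y_pred)]
    spearmans = [spearmanr(t, p)[0] for t, p in zip(y_true, y_pred)]
    return {"mean_pearson": float(np.mean(pearsons)),
            "mean_spearman": float(np.mean(spearmans))}

rng = np.random.default_rng(1)
truth = rng.normal(size=(4, 10))                        # 4 perturbations x 10 genes
pred = truth + rng.normal(scale=0.5, size=truth.shape)  # noisy "prediction"
print(mean_rowwise_correlations(truth, pred))
```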
46 changes: 46 additions & 0 deletions src/task/metrics/mean_correlation_r/config.vsh.yaml
@@ -0,0 +1,46 @@
__merge__: ../../api/comp_metric.yaml
functionality:
  name: mean_correlation_r
  info:
    metrics:
      - name: mean_pearson_r
        label: Mean Pearson bis
        summary: The mean of Pearson correlations per row (perturbation).
        description: |
          The **Mean Pearson Correlation** is computed as follows:

          $$
          \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{y}}_i)}}
          $$

          where $R$ is the number of scored rows, and $\mathbf{y}_i$ and $\mathbf{\hat{y}}_i$ are the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
        max: 1
        maximize: true
      - name: mean_spearman_r
        label: Mean Spearman bis
        summary: The mean of Spearman correlations per row (perturbation).
        description: |
          The **Mean Spearman Correlation** is computed as follows:

          $$
          \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{r}}_i)}}
          $$

          where $R$ is the number of scored rows, and $\mathbf{r}_i$ and $\mathbf{\hat{r}}_i$ are the ranks of the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
        max: 1
        maximize: true
  resources:
    - type: r_script
      path: script.R
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_r:1.0.4
  - type: nextflow
    directives:
      label: [ midtime, highmem, highcpu ]