diff --git a/scripts/add_a_method.sh b/scripts/add_a_method.sh index 5f035343..fe07dfdc 100755 --- a/scripts/add_a_method.sh +++ b/scripts/add_a_method.sh @@ -26,12 +26,6 @@ viash test src/task/methods/$method_id/config.vsh.yaml viash run src/task/methods/$method_id/config.vsh.yaml -- \ ---setup cachedbuild ---verbose -# run the method (using parquet as input) -viash run src/task/methods/$method_id/config.vsh.yaml -- \ - --de_train "resources/neurips-2023-kaggle/de_train.parquet" \ - --id_map "resources/neurips-2023-kaggle/id_map.csv" \ - --output "output/prediction.h5ad" - # run the method (using h5ad as input) viash run src/task/methods/$method_id/config.vsh.yaml -- \ --de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \ @@ -40,7 +34,7 @@ viash run src/task/methods/$method_id/config.vsh.yaml -- \ # run evaluation metric viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \ - --de_test "resources/neurips-2023-kaggle/de_test.parquet" \ + --de_test_h5ad "resources/neurips-2023-kaggle/de_test.h5ad" \ --prediction "output/prediction.h5ad" \ --output "output/score.h5ad" diff --git a/scripts/generate_resources.sh b/scripts/generate_resources.sh index 8493326a..72137f85 100755 --- a/scripts/generate_resources.sh +++ b/scripts/generate_resources.sh @@ -32,7 +32,7 @@ nextflow run \ --publish_dir "$OUT" echo ">> Run method" -viash run src/task/control_methods/sample/config.vsh.yaml -- \ +viash run src/task/control_methods/mean_across_compounds/config.vsh.yaml -- \ --de_train_h5ad "$OUT/de_train.h5ad" \ --de_test_h5ad "$OUT/de_test.h5ad" \ --id_map "$OUT/id_map.csv" \ diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh deleted file mode 100755 index 7d42d71a..00000000 --- a/scripts/run_benchmark.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -set -e - -IN="resources" -OUT="output" - -[[ ! -d "$OUT" ]] && mkdir -p "$OUT" - -# run benchmark -# 'input_states' looks for state.yaml files corresponding to datasets -export NXF_VER=23.04.2 - -nextflow run . \ - -main-script target/nextflow/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - --publish_dir "$OUT" \ - --output_state "state.yaml" \ - -entry auto \ - --input_states "$IN/**/state.yaml" diff --git a/scripts/run_stability_test.sh b/scripts/run_stability_test.sh deleted file mode 100755 index aff85076..00000000 --- a/scripts/run_stability_test.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -IN="resources" -OUT="output/test" - -[[ ! -d "$OUT" ]] && mkdir -p "$OUT" - -# run benchmark -# 'input_states' looks for state.yaml files corresponding to datasets -export NXF_VER=23.04.2 - -nextflow run . 
\ - -main-script target/nextflow/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - --publish_dir "$OUT" \ - -entry auto \ - --input_states "$IN/**/state.yaml" \ - --rename_keys 'de_train:de_train,de_train_h5ad:de_train_h5ad,de_test:de_test,de_test_h5ad:de_test_h5ad,id_map:id_map' \ - --settings '{"stability": true, "stability_num_replicates": 3, "stability_obs_fraction": 0.99, "stability_var_fraction": 0.99, "method_ids": ["zeros", "sample", "ground_truth", "mean_across_types", "mean_across_compounds"]}' \ - --output_state "state.yaml" \ No newline at end of file diff --git a/scripts/run_stability_benchmark_tw.sh b/scripts/run_stability_tw.sh similarity index 80% rename from scripts/run_stability_benchmark_tw.sh rename to scripts/run_stability_tw.sh index 87dbf19b..536496ce 100755 --- a/scripts/run_stability_benchmark_tw.sh +++ b/scripts/run_stability_tw.sh @@ -5,15 +5,15 @@ publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/result cat > /tmp/params.yaml << HERE id: dge_perturbation_task -input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/**/state.yaml +input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml output_state: "state.yaml" publish_dir: "$publish_dir" -rename_keys: "de_train:de_train,de_train_h5ad:de_train_h5ad,de_test:de_test,de_test_h5ad:de_test_h5ad,id_map:id_map" +rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map" settings: '{"stability": true, "stability_obs_fraction": 0.99, "stability_var_fraction": 0.99}' HERE tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \ - --revision add_bootstrapping_build \ + --revision main_build \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ diff --git a/scripts/run_tval_tw.sh b/scripts/run_tval_tw.sh new file mode 100755 index 00000000..1c875a3e --- /dev/null +++ b/scripts/run_tval_tw.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +RUN_ID="tval_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +id: dge_perturbation_task +input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml +output_state: "state.yaml" +publish_dir: "$publish_dir" +rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map" +settings: '{"layer": "t"}' +HERE + +tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \ + --revision add_cell_obs_to_uns_build \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/common/nextflow_helpers/labels_tw.config diff --git a/scripts/sync_results.sh b/scripts/sync_results.sh index 0b408e5c..04405114 100755 --- a/scripts/sync_results.sh +++ b/scripts/sync_results.sh @@ -4,3 +4,8 @@ aws s3 sync \ s3://openproblems-data/resources/dge_perturbation_prediction/results/ \ output/benchmark_results/ \ --delete --dryrun + +aws s3 sync \ + output/benchmark_results/ \ + s3://openproblems-data/resources/dge_perturbation_prediction/results/ \ + --delete --dryrun \ No newline at end of file diff --git a/src/common/nextflow_helpers/labels_tw.config b/src/common/nextflow_helpers/labels_tw.config index 1869a98c..21f73426 
100644 --- a/src/common/nextflow_helpers/labels_tw.config +++ b/src/common/nextflow_helpers/labels_tw.config @@ -30,33 +30,28 @@ process { disk = { 400.GB * task.attempt } } withLabel: lowsharedmem { - containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""} + containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" } } withLabel: midsharedmem { - containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""} + containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" } } withLabel: highsharedmem { - containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""} + containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" } } withLabel: gpu { - cpus = 16 + memory = 100.GB + cpus = 20 accelerator = 1 - containerOptions = { workflow.containerEngine == "singularity" ? '--nv': - ( workflow.containerEngine == "docker" ? '--gpus all': null ) } } withLabel: midgpu { // aiming for g4dn.12xlarge memory = 150.GB cpus = 40 accelerator = 4 - containerOptions = { workflow.containerEngine == "singularity" ? '--nv': - ( workflow.containerEngine == "docker" ? '--gpus all': null ) } } withLabel: highgpu { // aiming for g4dn.metal memory = 300.GB cpus = 80 accelerator = 8 - containerOptions = { workflow.containerEngine == "singularity" ? '--nv': - ( workflow.containerEngine == "docker" ? '--gpus all': null ) } } // make sure publishstates gets enough disk space and memory withName:'.*publishStatesProc' { diff --git a/src/task/methods/lgc_ensemble_helpers/helper_functions.py b/src/task/methods/lgc_ensemble_helpers/helper_functions.py index 7766d072..68347b11 100644 --- a/src/task/methods/lgc_ensemble_helpers/helper_functions.py +++ b/src/task/methods/lgc_ensemble_helpers/helper_functions.py @@ -210,8 +210,8 @@ def train_function(model, model_name, x_train, y_train, x_val, y_val, info_data, if val_mrrmse < best_loss: best_loss = val_mrrmse best_weights = model.state_dict() - # print('BEST ----> ') - # print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}") + print('BEST ----> ') + print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}") t1 = time.time() results['runtime'] = float(t1-t0) model.load_state_dict(best_weights) @@ -230,19 +230,12 @@ def cross_validate_models(X, y, kf_cv, cell_types_sm_names, paths, config=None, 'val_sm_name': cell_types_sm_names.iloc[val_idx]['sm_name'].tolist()} for Model in [LSTM, Conv, GRU]: model = Model(scheme, X.shape, y.shape) - - if torch.cuda.device_count() > 1: - model = nn.DataParallel(model) - model_name = model.module.name - else: - model_name = model.name - - model, results = train_function(model, model_name, x_train, y_train, x_val, y_val, info_data, config=config, clip_norm=clip_norm) + model, results = train_function(model, model.name, x_train, y_train, x_val, y_val, info_data, config=config, clip_norm=clip_norm) model.to('cpu') trained_models.append(model) - print(f'PATH OF THE MODEL EQUALS: {paths["model_dir"]}/pytorch_{model_name}_{scheme}_fold{i}.pt') - torch.save(model.state_dict(), f'{paths["model_dir"]}/pytorch_{model_name}_{scheme}_fold{i}.pt') - with open(f'{paths["logs_dir"]}/{model_name}_{scheme}_fold{i}.json', 'w') as file: + print(f'PATH OF THE MODEL 
EQUALS: {paths["model_dir"]}/pytorch_{model.name}_{scheme}_fold{i}.pt') + torch.save(model.state_dict(), f'{paths["model_dir"]}/pytorch_{model.name}_{scheme}_fold{i}.pt') + with open(f'{paths["logs_dir"]}/{model.name}_{scheme}_fold{i}.json', 'w') as file: json.dump(results, file) if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/src/task/methods/lgc_ensemble_helpers/prepare_data.py b/src/task/methods/lgc_ensemble_helpers/prepare_data.py index 4d948210..481f0a63 100644 --- a/src/task/methods/lgc_ensemble_helpers/prepare_data.py +++ b/src/task/methods/lgc_ensemble_helpers/prepare_data.py @@ -19,6 +19,7 @@ def prepare_data(par, paths): mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() std_sm_name = de_sm_name.groupby('sm_name').std().reset_index() + std_sm_name_filled = std_sm_name.fillna(0) cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line quantiles_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\ .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1) @@ -30,7 +31,7 @@ def prepare_data(par, paths): mean_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv', index=False) std_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/std_cell_type.csv', index=False) mean_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/mean_sm_name.csv', index=False) - std_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) + std_sm_name_filled.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) quantiles_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False) ## Create one hot encoding features one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"]) diff --git a/src/task/methods/lgc_ensemble_predict/script.py b/src/task/methods/lgc_ensemble_predict/script.py index fe5de1e8..bb45df76 100644 --- a/src/task/methods/lgc_ensemble_predict/script.py +++ b/src/task/methods/lgc_ensemble_predict/script.py @@ -145,13 +145,14 @@ df_sub.reset_index(drop=True, inplace=True) # write output +method_id = meta["functionality_name"].replace("_predict", "") output = ad.AnnData( layers={"prediction": df_sub.to_numpy()}, obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ "dataset_id": train_config["DATASET_ID"], - "method_id": meta["functionality_name"] + "method_id": method_id } ) print(output) diff --git a/src/task/methods/lgc_ensemble_prepare/config.vsh.yaml b/src/task/methods/lgc_ensemble_prepare/config.vsh.yaml index 26259733..25d8741b 100644 --- a/src/task/methods/lgc_ensemble_prepare/config.vsh.yaml +++ b/src/task/methods/lgc_ensemble_prepare/config.vsh.yaml @@ -79,4 +79,4 @@ platforms: - type: native - type: nextflow directives: - label: [hightime, veryhighmem, highcpu, highsharedmem, gpu] + label: [hightime, veryhighmem, highcpu] diff --git a/src/task/methods/lgc_ensemble_prepare/script.py b/src/task/methods/lgc_ensemble_prepare/script.py index c4fd164c..53f68309 100644 --- a/src/task/methods/lgc_ensemble_prepare/script.py +++ b/src/task/methods/lgc_ensemble_prepare/script.py @@ -60,6 +60,7 @@ mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = 
de_cell_type.groupby('cell_type').std().reset_index() std_sm_name = de_sm_name.groupby('sm_name').std().reset_index() +std_sm_name_filled = std_sm_name.fillna(0) cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line quantiles_cell_type = pd.concat( [pd.DataFrame(cell_types)] + @@ -74,7 +75,7 @@ mean_cell_type.to_csv(f'{par["train_data_aug_dir"]}/mean_cell_type.csv', index=False) std_cell_type.to_csv(f'{par["train_data_aug_dir"]}/std_cell_type.csv', index=False) mean_sm_name.to_csv(f'{par["train_data_aug_dir"]}/mean_sm_name.csv', index=False) -std_sm_name.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False) +std_sm_name_filled.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False) quantiles_cell_type.to_csv(f'{par["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False) with open(f'{par["train_data_aug_dir"]}/gene_names.json', 'w') as f: json.dump(gene_names, f) diff --git a/src/task/methods/lgc_ensemble_train/config.vsh.yaml b/src/task/methods/lgc_ensemble_train/config.vsh.yaml index 6a4db6b3..bb64ac65 100644 --- a/src/task/methods/lgc_ensemble_train/config.vsh.yaml +++ b/src/task/methods/lgc_ensemble_train/config.vsh.yaml @@ -69,4 +69,4 @@ platforms: - type: native - type: nextflow directives: - label: [hightime, veryhighmem, highcpu] + label: [hightime, veryhighmem, highcpu, highsharedmem, gpu] diff --git a/src/task/methods/lgc_ensemble_train/script.py b/src/task/methods/lgc_ensemble_train/script.py index b46d9407..fa557221 100644 --- a/src/task/methods/lgc_ensemble_train/script.py +++ b/src/task/methods/lgc_ensemble_train/script.py @@ -3,6 +3,10 @@ import json import numpy as np import pandas as pd +if torch.cuda.is_available(): + print("using device: cuda", flush=True) +else: + print('using device: cpu', flush=True) ## VIASH START par = { diff --git a/src/task/methods/transformer_ensemble/utils.py b/src/task/methods/transformer_ensemble/utils.py index 51e33598..d5beea93 100644 --- a/src/task/methods/transformer_ensemble/utils.py +++ b/src/task/methods/transformer_ensemble/utils.py @@ -47,10 +47,10 @@ def prepare_augmented_data( de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))] mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index() - std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() + std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0) mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() - std_sm_name = de_sm_name.groupby('sm_name').std().reset_index() + std_sm_name = de_sm_name.groupby('sm_name').std().reset_index().fillna(0) # Append mean and std for 'cell_type' rows = [] diff --git a/src/task/metrics/mean_correlation/config.vsh.yaml b/src/task/metrics/mean_correlation/config.vsh.yaml index 38a5d91f..2c198d0f 100644 --- a/src/task/metrics/mean_correlation/config.vsh.yaml +++ b/src/task/metrics/mean_correlation/config.vsh.yaml @@ -7,7 +7,13 @@ functionality: label: Mean Pearson summary: The mean of Pearson correlations per row (perturbation). description: | - We use the **Mean Pearson Correlation** to score submissions. 
+ The **Mean Pearson Correlation** is computed as follows: + + $$ + \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i) \cdot \textrm{Var}(\mathbf{\hat{y}}_i)}} + $$ + + where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$. repository_url: null documentation_url: null min: -1 @@ -17,7 +23,13 @@ functionality: label: Mean Spearman summary: The mean of Spearman correlations per row (perturbation). description: | - We use the **Mean Spearman Correlation** to score submissions. + The **Mean Spearman Correlation** is computed as follows: + + $$ + \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i) \cdot \textrm{Var}(\mathbf{\hat{r}}_i)}} + $$ + + where $(R)$ is the number of scored rows, and $(\mathbf{r}_i)$ and $(\mathbf{\hat{r}}_i)$ are the ranks of the actual and predicted values, respectively, for row $(i)$. repository_url: null documentation_url: null min: -1 @@ -29,9 +41,6 @@ functionality: platforms: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: [ fastparquet ] - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/task/metrics/mean_correlation_r/config.vsh.yaml b/src/task/metrics/mean_correlation_r/config.vsh.yaml new file mode 100644 index 00000000..0981a39c --- /dev/null +++ b/src/task/metrics/mean_correlation_r/config.vsh.yaml @@ -0,0 +1,46 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: mean_correlation_r + info: + metrics: + - name: mean_pearson_r + label: Mean Pearson bis + summary: The mean of Pearson correlations per row (perturbation). + description: | + The **Mean Pearson Correlation** is computed as follows: + + $$ + \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i) \cdot \textrm{Var}(\mathbf{\hat{y}}_i)}} + $$ + + where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$. + repository_url: null + documentation_url: null + min: -1 + max: 1 + maximize: true + - name: mean_spearman_r + label: Mean Spearman bis + summary: The mean of Spearman correlations per row (perturbation). + description: | + The **Mean Spearman Correlation** is computed as follows: + + $$ + \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i) \cdot \textrm{Var}(\mathbf{\hat{r}}_i)}} + $$ + + where $(R)$ is the number of scored rows, and $(\mathbf{r}_i)$ and $(\mathbf{\hat{r}}_i)$ are the ranks of the actual and predicted values, respectively, for row $(i)$.
+ repository_url: null + documentation_url: null + min: -1 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + - type: nextflow + directives: + label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/task/metrics/mean_correlation_r/script.R b/src/task/metrics/mean_correlation_r/script.R new file mode 100644 index 00000000..a53e1b61 --- /dev/null +++ b/src/task/metrics/mean_correlation_r/script.R @@ -0,0 +1,54 @@ +library(anndata) + +## VIASH START +par <- list( + de_test_h5ad = "resources/neurips-2023-data/de_test.h5ad", + de_test_layer = "sign_log10_pval", + prediction = "resources/neurips-2023-data/prediction.h5ad", + prediction_layer = "prediction", + resolve_genes = "de_test", + output = "output.h5ad" +) +## VIASH END + +cat("Load data\n") +de_test <- read_h5ad(par$de_test_h5ad) +cat("de_test: "); print(de_test) +prediction <- read_h5ad(par$prediction) +cat("prediction: "); print(prediction) + +cat("Resolve genes\n") +genes <- + if (par$resolve_genes == "de_test") { + de_test$var_names + } else if (par$resolve_genes == "intersection") { + intersect(de_test$var_names, prediction$var_names) + } +de_test <- de_test[, genes] +prediction <- prediction[, genes] + +de_test_X <- de_test$layers[[par$de_test_layer]] +prediction_X <- prediction$layers[[par$prediction_layer]] + +cat("Calculate metrics\n") +out <- cor(t(de_test_X), t(prediction_X), method = "pearson") +pearson <- diag(out) +mean_pearson <- mean(ifelse(is.na(pearson), 0, pearson)) + +out2 <- cor(t(de_test_X), t(prediction_X), method = "spearman") +spearman <- diag(out2) +mean_spearman <- mean(ifelse(is.na(spearman), 0, spearman)) + +cat("Create output\n") +output <- AnnData( + shape = c(0L, 0L), + uns = list( + dataset_id = de_test$uns[["dataset_id"]], + method_id = prediction$uns[["method_id"]], + metric_ids = c("mean_pearson_r", "mean_spearman_r"), + metric_values = c(mean_pearson, mean_spearman) + ) +) + +cat("Write output\n") +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/task/metrics/mean_cosine_sim/config.vsh.yaml b/src/task/metrics/mean_cosine_sim/config.vsh.yaml index 41a5ac42..12b67bd7 100644 --- a/src/task/metrics/mean_cosine_sim/config.vsh.yaml +++ b/src/task/metrics/mean_cosine_sim/config.vsh.yaml @@ -7,10 +7,10 @@ functionality: label: Mean Cosine Similarity summary: The mean of cosine similarities per row (perturbation). description: | - We use the **Mean Cosine Similarity** to score submissions, computed as follows: + The **Mean Cosine Similarity** is computed as follows: $$ - \textrm{Mean-Cosine} = \frac{1}{R} \sum_{i=1}^R \frac{\mathbf{y}_i \cdot \mathbf{\hat{y}}_i}{\|\mathbf{y}_i\| \|\mathbf{\hat{y}}_i\|} + \textrm{Mean-Cosine} = \frac{1}{R}\sum_{i=1}^R\frac{\mathbf{y}_i\cdot \mathbf{\hat{y}}_i}{\|\mathbf{y}_i\| \|\mathbf{\hat{y}}_i\|} $$ where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$. 
@@ -34,9 +34,6 @@ functionality: platforms: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: [ fastparquet ] - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/task/metrics/mean_cosine_sim_r/config.vsh.yaml b/src/task/metrics/mean_cosine_sim_r/config.vsh.yaml new file mode 100644 index 00000000..396a0c48 --- /dev/null +++ b/src/task/metrics/mean_cosine_sim_r/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: mean_cosine_sim_r + info: + metrics: + - name: mean_cosine_sim_r + label: Mean Cosine Similarity bis + summary: The mean of cosine similarities per row (perturbation). + description: | + The **Mean Cosine Similarity** is computed as follows: + + $$ + \textrm{Mean-Cosine} = \frac{1}{R}\sum_{i=1}^R\frac{\mathbf{y}_i\cdot \mathbf{\hat{y}}_i}{\|\mathbf{y}_i\| \|\mathbf{\hat{y}}_i\|} + $$ + + where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$. + repository_url: null + documentation_url: null + min: -1 + max: 1 + maximize: true + - name: mean_cosine_sim_clipped_0001_r + label: Mean Cosine Similarity clipped at 0.0001 bis + summary: The mean of cosine similarities per row (perturbation). Values are clipped to 0.0001 adjusted p-values. + description: This metric is the same as `mean_cosine_sim`, but with the values clipped to [-log10(0.0001), log10(0.0001)]. + repository_url: null + documentation_url: null + min: -1 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + packages: proxyC + - type: nextflow + directives: + label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/task/metrics/mean_cosine_sim_r/script.R b/src/task/metrics/mean_cosine_sim_r/script.R new file mode 100644 index 00000000..870aa660 --- /dev/null +++ b/src/task/metrics/mean_cosine_sim_r/script.R @@ -0,0 +1,57 @@ +library(anndata) + +## VIASH START (unchanged) +par <- list( + de_test_h5ad = "resources/neurips-2023-data/de_test.h5ad", + de_test_layer = "sign_log10_pval", + prediction = "resources/neurips-2023-data/prediction.h5ad", + prediction_layer = "prediction", + resolve_genes = "de_test", + output = "output.h5ad" +) +## VIASH END + +cat("Load data\n") +de_test <- read_h5ad(par$de_test_h5ad) +cat("de_test: "); print(de_test) +prediction <- read_h5ad(par$prediction) +cat("prediction: "); print(prediction) + +cat("Resolve genes\n") +genes <- + if (par$resolve_genes == "de_test") { + de_test$var_names + } else if (par$resolve_genes == "intersection") { + intersect(de_test$var_names, prediction$var_names) + } +de_test <- de_test[, genes] +prediction <- prediction[, genes] + +de_test_X <- de_test$layers[[par$de_test_layer]] +prediction_X <- prediction$layers[[par$prediction_layer]] + +cat("Clipping values\n") +threshold_0001 <- -log10(0.0001) +de_test_X_clipped_0001 <- pmax(pmin(de_test_X, threshold_0001), -threshold_0001) +prediction_clipped_0001 <- pmax(pmin(prediction_X, threshold_0001), -threshold_0001) + +cat("Calculate mean cosine similarity\n") +cosine <- proxyC::simil(de_test_X, prediction_X, method = "cosine", diag = TRUE) +mean_cosine_similarity <- mean(cosine@x) + +cosine_clipped <- proxyC::simil(de_test_X_clipped_0001, prediction_clipped_0001, method = "cosine", diag = TRUE) +mean_cosine_similarity_clipped_0001 
<- mean(cosine_clipped@x) + +cat("Create output\n") +output <- AnnData( + shape = c(0L, 0L), + uns = list( + dataset_id = de_test$uns[["dataset_id"]], + method_id = prediction$uns[["method_id"]], + metric_ids = c("mean_cosine_sim_r", "mean_cosine_sim_clipped_0001_r"), + metric_values = c(mean_cosine_similarity, mean_cosine_similarity_clipped_0001) + ) +) + +cat("Write output\n") +write_h5ad(output, par$output, compression = "gzip") diff --git a/src/task/metrics/mean_rowwise_error/config.vsh.yaml b/src/task/metrics/mean_rowwise_error/config.vsh.yaml index 239f289b..934b004a 100644 --- a/src/task/metrics/mean_rowwise_error/config.vsh.yaml +++ b/src/task/metrics/mean_rowwise_error/config.vsh.yaml @@ -7,13 +7,14 @@ functionality: label: Mean Rowwise RMSE summary: The mean of the root mean squared error (RMSE) of each row in the matrix. description: | - We use the **Mean Rowwise Root Mean Squared Error** to score submissions, computed as follows: + The **Mean Rowwise Root Mean Squared Error** is computed as follows: $$ \textrm{MRRMSE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} (y_{ij} - \widehat{y}_{ij})^2\right)^{1/2} $$ where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ bis the number of columns. + repository_url: null documentation_url: null min: 0 @@ -32,13 +33,13 @@ functionality: label: Mean Rowwise MAE summary: The mean of the absolute error (MAE) of each row in the matrix. description: | - We use the **Mean Rowwise Absolute Error** to score submissions, computed as follows: + The **Mean Rowwise Absolute Error** is computed as follows: - $$ - \textrm{MRMAE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} |y_{ij} - \widehat{y}_{ij}|\right) - $$ - - where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ bis the number of columns. + $$ + \textrm{MRMAE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} |y_{ij} - \widehat{y}_{ij}|\right) + $$ + + where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns.
repository_url: null documentation_url: null min: 0 @@ -59,9 +60,6 @@ functionality: platforms: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: [ fastparquet ] - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/task/metrics/mean_rowwise_error/script.py b/src/task/metrics/mean_rowwise_error/script.py index 296e2966..b79e1e08 100644 --- a/src/task/metrics/mean_rowwise_error/script.py +++ b/src/task/metrics/mean_rowwise_error/script.py @@ -36,10 +36,17 @@ prediction_clipped_0001 = np.clip(prediction_X, -threshold_0001, threshold_0001) print("Calculate mean rowwise RMSE", flush=True) -mean_rowwise_rmse = np.mean(np.sqrt(np.mean(np.square(de_test_X - prediction_X), axis=0))) -mean_rowwise_rmse_clipped_0001 = np.mean(np.sqrt(np.mean(np.square(de_test_X_clipped_0001 - prediction_clipped_0001), axis=0))) -mean_rowwise_mae = np.mean(np.mean(np.abs(de_test_X - prediction_X), axis=0)) -mean_rowwise_mae_clipped_0001 = np.mean(np.mean(np.abs(de_test_X_clipped_0001 - prediction_clipped_0001), axis=0)) +rowwise_rmse = np.sqrt(np.mean(np.square(de_test_X - prediction_X), axis=1)) +mean_rowwise_rmse = np.mean(rowwise_rmse) + +rowwise_rmse_clipped_0001 = np.sqrt(np.mean(np.square(de_test_X_clipped_0001 - prediction_clipped_0001), axis=1)) +mean_rowwise_rmse_clipped_0001 = np.mean(rowwise_rmse_clipped_0001) + +rowwise_mae = np.mean(np.abs(de_test_X - prediction_X), axis=1) +mean_rowwise_mae = np.mean(rowwise_mae) + +rowwise_mae_clipped_0001 = np.mean(np.abs(de_test_X_clipped_0001 - prediction_clipped_0001), axis=1) +mean_rowwise_mae_clipped_0001 = np.mean(rowwise_mae_clipped_0001) print("Create output", flush=True) output = ad.AnnData( diff --git a/src/task/metrics/mean_rowwise_error_r/config.vsh.yaml b/src/task/metrics/mean_rowwise_error_r/config.vsh.yaml new file mode 100644 index 00000000..41b202fd --- /dev/null +++ b/src/task/metrics/mean_rowwise_error_r/config.vsh.yaml @@ -0,0 +1,67 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: mean_rowwise_error_r + info: + metrics: + - name: mean_rowwise_rmse_r + label: Mean Rowwise RMSE bis + summary: The mean of the root mean squared error (RMSE) of each row in the matrix. + description: | + We use the **Mean Rowwise Root Mean Squared Error** to score submissions, computed as follows: + + $$ + \textrm{MRRMSE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} (y_{ij} - \widehat{y}_{ij})^2\right)^{1/2} + $$ + + where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns. + repository_url: null + documentation_url: null + min: 0 + max: "+inf" + maximize: false + - name: mean_rowwise_rmse_clipped_0001_r + label: Mean Rowwise RMSE clipped at 0.0001 + summary: The mean of the root mean squared error (RMSE) of each row in the matrix, where the values are clipped to 0.0001 adjusted p-values + description: This metric is the same as `mean_rowwise_rmse`, but with the values clipped to [-log10(0.0001), log10(0.0001)]. + repository_url: null + documentation_url: null + min: 0 + max: "+inf" + maximize: false + - name: mean_rowwise_mae_r + label: Mean Rowwise MAE + summary: The mean of the absolute error (MAE) of each row in the matrix.
+ description: | + We use the **Mean Rowwise Absolute Error** to score submissions, computed as follows: + + $$ + \textrm{MRMAE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} |y_{ij} - \widehat{y}_{ij}|\right) + $$ + + where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns. + repository_url: null + documentation_url: null + min: 0 + max: "+inf" + maximize: false + - name: mean_rowwise_mae_clipped_0001_r + label: Mean Rowwise MAE clipped at 0.0001 + summary: The mean of the absolute error (MAE) of each row in the matrix. The values are clipped to 0.0001 adjusted p-values. + description: This metric is the same as `mean_rowwise_mae`, but with the values clipped to [-log10(0.0001), log10(0.0001)]. + repository_url: null + documentation_url: null + min: 0 + max: "+inf" + maximize: false + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + packages: proxyC + - type: nextflow + directives: + label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/task/metrics/mean_rowwise_error_r/script.R b/src/task/metrics/mean_rowwise_error_r/script.R new file mode 100644 index 00000000..4be51c3c --- /dev/null +++ b/src/task/metrics/mean_rowwise_error_r/script.R @@ -0,0 +1,81 @@ +library(anndata) + +## VIASH START (unchanged) +par <- list( + de_test_h5ad = "resources/neurips-2023-data/de_test.h5ad", + de_test_layer = "sign_log10_pval", + prediction = "resources/neurips-2023-data/prediction.h5ad", + prediction_layer = "prediction", + resolve_genes = "de_test", + output = "output.h5ad" +) +## VIASH END + +cat("Load data\n") +de_test <- read_h5ad(par$de_test_h5ad) +cat("de_test: "); print(de_test) +prediction <- read_h5ad(par$prediction) +cat("prediction: "); print(prediction) + +cat("Resolve genes\n") +genes <- + if (par$resolve_genes == "de_test") { + de_test$var_names + } else if (par$resolve_genes == "intersection") { + intersect(de_test$var_names, prediction$var_names) + } +de_test <- de_test[, genes] +prediction <- prediction[, genes] + +de_test_X <- de_test$layers[[par$de_test_layer]] +prediction_X <- prediction$layers[[par$prediction_layer]] + +if (any(is.na(de_test_X))) { + stop("NA values in de_test_X") +} +if (any(is.na(prediction_X))) { + stop("NA values in prediction_X") +} + +cat("Clipping values\n") +threshold_0001 <- -log10(0.0001) +de_test_X_clipped_0001 <- pmax(pmin(de_test_X, threshold_0001), -threshold_0001) +prediction_clipped_0001 <- pmax(pmin(prediction_X, threshold_0001), -threshold_0001) + +cat("Calculate mean rowwise RMSE\n") +rowwise_rmse <- sqrt(rowMeans((de_test_X - prediction_X)^2)) +mean_rowwise_rmse <- mean(rowwise_rmse) + +rowwise_rmse_clipped_0001 <- sqrt(rowMeans((de_test_X_clipped_0001 - prediction_clipped_0001)^2)) +mean_rowwise_rmse_clipped_0001 <- mean(rowwise_rmse_clipped_0001) + +cat("Calculate mean rowwise MAE\n") +rowwise_mae <- rowMeans(abs(de_test_X - prediction_X)) +mean_rowwise_mae <- mean(rowwise_mae) + +rowwise_mae_clipped_0001 <- rowMeans(abs(de_test_X_clipped_0001 - prediction_clipped_0001)) +mean_rowwise_mae_clipped_0001 <- mean(rowwise_mae_clipped_0001) + +cat("Create output\n") +output <- AnnData( + shape = c(0L, 0L), + uns = list( + dataset_id = de_test$uns[["dataset_id"]], + method_id = prediction$uns[["method_id"]], + metric_ids = c( + "mean_rowwise_rmse_r", + "mean_rowwise_mae_r", +
"mean_rowwise_rmse_clipped_0001_r", + "mean_rowwise_mae_clipped_0001_r" + ), + metric_values = c( + mean_rowwise_rmse, + mean_rowwise_mae, + mean_rowwise_rmse_clipped_0001, + mean_rowwise_mae_clipped_0001 + ) + ) +) + +cat("Write output\n") +write_h5ad(output, par$output, compression = "gzip") diff --git a/src/task/process_dataset/bootstrap/script.R b/src/task/process_dataset/bootstrap/script.R index 09cac135..a8c60f2c 100644 --- a/src/task/process_dataset/bootstrap/script.R +++ b/src/task/process_dataset/bootstrap/script.R @@ -39,6 +39,13 @@ for (i in seq_len(par$num_replicates)) { output_train_h5ad <- train_h5ad[obs_ix, var_ix] output_test_h5ad <- test_h5ad[, var_ix] + original_dataset_id <- output_train_h5ad$uns[["dataset_id"]] + dataset_id <- paste0(original_dataset_id, "_bootstrap", i) + output_train_h5ad$uns[["dataset_id"]] <- dataset_id + output_test_h5ad$uns[["dataset_id"]] <- dataset_id + output_train_h5ad$uns[["original_dataset_id"]] <- original_dataset_id + output_test_h5ad$uns[["original_dataset_id"]] <- original_dataset_id + # write output output_train_h5ad_path <- gsub("\\*", i, par$output_train_h5ad) output_test_h5ad_path <- gsub("\\*", i, par$output_test_h5ad) diff --git a/src/task/workflows/run_benchmark/config.vsh.yaml b/src/task/workflows/run_benchmark/config.vsh.yaml index 5d2307ee..98bfa750 100644 --- a/src/task/workflows/run_benchmark/config.vsh.yaml +++ b/src/task/workflows/run_benchmark/config.vsh.yaml @@ -105,6 +105,9 @@ functionality: - name: metrics/mean_rowwise_error - name: metrics/mean_cosine_sim - name: metrics/mean_correlation + - name: metrics/mean_rowwise_error_r + - name: metrics/mean_cosine_sim_r + - name: metrics/mean_correlation_r - name: process_dataset/bootstrap - name: process_dataset/convert_h5ad_to_parquet repositories: diff --git a/src/task/workflows/run_benchmark/main.nf b/src/task/workflows/run_benchmark/main.nf index ab9d96c8..f449bdcb 100644 --- a/src/task/workflows/run_benchmark/main.nf +++ b/src/task/workflows/run_benchmark/main.nf @@ -17,8 +17,11 @@ methods = [ // construct list of metrics metrics = [ mean_rowwise_error, + mean_rowwise_error_r, mean_cosine_sim, - mean_correlation + mean_cosine_sim_r, + mean_correlation, + mean_correlation_r ] // helper workflow for starting a workflow based on lists of yaml files