Add clipping to layer (openproblems-bio#57)

* add a clipped layer to the limma component * add scripts * fix labels
Paulos2411 · May 31, 2024 · e8451aa · e8451aa
1 parent 3addfba
commit e8451aa
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 7 deletions.
diff --git a/scripts/run_benchmark_test.sh b/scripts/run_benchmark_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# abort as soon as any command fails
+set -e
+
+input_dir="resources"
+publish_dir="output"
+
+# pin the Nextflow version; -p makes mkdir a no-op for an existing directory
+export NXF_VER=23.04.2
+mkdir -p "$publish_dir"
+
+# 'input_states' looks for the state.yaml files of the datasets under the input directory
+nextflow run . \
+  -main-script target/nextflow/workflows/run_benchmark/main.nf \
+  -profile docker \
+  -resume \
+  -entry auto \
+  --input_states "$input_dir/**/state.yaml" \
+  --output_state "state.yaml" \
+  --publish_dir "$publish_dir"
diff --git a/scripts/run_layerclip_tw.sh b/scripts/run_layerclip_tw.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Launch the run_benchmark workflow through the `tw` CLI with the `layer`
+# setting set to "clipped_sign_log10_pval".
+
+# Abort on the first failing command: without this, a failed write of the
+# params file would still run `tw launch`, picking up whatever stale
+# /tmp/params.yaml an earlier run left behind.
+set -e
+
+# timestamped run id, used as the last component of the publish directory
+RUN_ID="layerclip_$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"
+
+# the heredoc delimiter is unquoted so that $publish_dir is expanded below
+cat > /tmp/params.yaml << HERE
+id: dge_perturbation_task
+input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
+settings: '{"layer": "clipped_sign_log10_pval"}'
+HERE
+
+tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
+  --revision main_build \
+  --pull-latest \
+  --main-script target/nextflow/workflows/run_benchmark/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --entry-name auto \
+  --config src/common/nextflow_helpers/labels_tw.config
diff --git a/scripts/run_tval_tw.sh → scripts/run_layert_tw.sh b/scripts/run_tval_tw.sh → scripts/run_layert_tw.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-RUN_ID="tval_$(date +%Y-%m-%d_%H-%M-%S)"
+RUN_ID="layert_$(date +%Y-%m-%d_%H-%M-%S)"
 publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"
 
 cat > /tmp/params.yaml << HERE
@@ -13,7 +13,7 @@ settings: '{"layer": "t"}'
 HERE
 
 tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
-  --revision add_cell_obs_to_uns_build \
+  --revision main_build \
   --pull-latest \
   --main-script target/nextflow/workflows/run_benchmark/main.nf \
   --workspace 53907369739130 \

diff --git a/src/task/control_methods/mean_across_celltypes/config.vsh.yaml b/src/task/control_methods/mean_across_celltypes/config.vsh.yaml
@@ -2,7 +2,7 @@ __merge__: ../../api/comp_control_method.yaml
 functionality:
   name: mean_across_celltypes
   info:
-    label: Mean outcome for the cell type
+    label: Mean per cell type and gene
     summary: Baseline method that returns mean of cell type's outcomes
     description: |
       Baseline method that predicts for a cell type the mean of its outcomes of all compounds.

diff --git a/src/task/control_methods/mean_across_compounds/config.vsh.yaml b/src/task/control_methods/mean_across_compounds/config.vsh.yaml
@@ -2,7 +2,7 @@ __merge__: ../../api/comp_control_method.yaml
 functionality:
   name: mean_across_compounds
   info:
-    label: Mean outcome for the compound
+    label: Mean per compound and gene
     summary: Baseline method that returns mean of compound's outcomes
     description: |
       Baseline method that predicts for a compound the mean of its outcomes of all samples.

diff --git a/src/task/control_methods/mean_outcome/config.vsh.yaml b/src/task/control_methods/mean_outcome/config.vsh.yaml
@@ -2,7 +2,7 @@ __merge__: ../../api/comp_control_method.yaml
 functionality:
   name: mean_outcome
   info:
-    label: Mean outcome for a gene
+    label: Mean per gene
     summary: Baseline method that returns mean of gene's outcomes
     description: |
       Baseline method that predicts for a gene the mean of its outcomes of all samples.

diff --git a/src/task/process_dataset/run_limma/config.vsh.yaml b/src/task/process_dataset/run_limma/config.vsh.yaml
@@ -32,6 +32,11 @@ functionality:
       type: double
       required: false
       default: 0.05
+    - name: --clipping_cutoff
+      type: double
+      required: false
+      default: 0.0001
+      description: Clip the signed log10 p-values to lie between log10(clipping_cutoff) and -log10(clipping_cutoff)
     - name: --control_compound
       type: string
       required: false

diff --git a/src/task/process_dataset/run_limma/script.R b/src/task/process_dataset/run_limma/script.R
@@ -110,7 +110,9 @@ de_df2 <- de_df %>%
     sign_log10_adj_pval = sign(logFC) * -log10(ifelse(adj.P.Value == 0, .Machine$double.eps, adj.P.Value)),
     # determine if gene is DE
     is_de = P.Value < par$de_sig_cutoff,
-    is_de_adj = adj.P.Value < par$de_sig_cutoff
+    is_de_adj = adj.P.Value < par$de_sig_cutoff,
+    # compute sign(logFC) × -log10(p-value), with p-values floored at the clipping cutoff
+    clipped_sign_log10_pval = sign(logFC) * -log10(pmax(par$clipping_cutoff, P.Value)),
   ) %>%
   as_tibble()
 
@@ -122,7 +124,7 @@ rownames(new_obs) <- paste0(new_obs$cell_type, ", ", new_obs$sm_name)
 new_var <- data.frame(row.names = levels(de_df2$gene))
 
 # create layers from de_df
-layer_names <- c("is_de", "is_de_adj", "logFC", "AveExpr", "t", "P.Value", "adj.P.Value", "B", "sign_log10_adj_pval", "sign_log10_pval")
+layer_names <- c("is_de", "is_de_adj", "logFC", "AveExpr", "t", "P.Value", "adj.P.Value", "B", "sign_log10_adj_pval", "sign_log10_pval", "clipped_sign_log10_pval")
 layers <- map(setNames(layer_names, layer_names), function(layer_name) {
   de_df2 %>%
     select(gene, row_i, !!layer_name) %>%