Skip to content

Commit

Permalink
Clean up metrics (openproblems-bio#59)
Browse files Browse the repository at this point in the history
* remove clipped metrics; remove python implementations of metrics

* add clipped layer to kaggle dataset

* change default layer from sign_log10_pval to clipped_sign_log10_pval

* set non-finite values to 0

* fix metricFromState argument passing

* zapsmall

* clean up scripts

* update kaggle script

* fix scripts

* fix stability wf
  • Loading branch information
rcannood authored Jun 2, 2024
1 parent 02d3dff commit 753f37c
Show file tree
Hide file tree
Showing 41 changed files with 170 additions and 620 deletions.
21 changes: 12 additions & 9 deletions scripts/generate_kaggle_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ OUT=resources/neurips-2023-kaggle

[[ ! -d $OUT ]] && mkdir -p $OUT

aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_test.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_test.h5ad
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_train.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_train.h5ad

# recompress h5ad files
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad', compression='gzip')"
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad', compression='gzip')"
if [[ ! -f "$OUT/2023-09-12_de_by_cell_type_test.h5ad" ]]; then
echo ">> Downloading data"
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_test.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_test.h5ad
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_train.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_train.h5ad

# recompress h5ad files
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad', compression='gzip')"
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad', compression='gzip')"
fi

viash run src/task/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml -- \
--input_train "$OUT/2023-09-12_de_by_cell_type_train.h5ad" \
Expand All @@ -31,7 +34,7 @@ viash run src/task/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yam
--dataset_organism homo_sapiens

echo ">> Run method"
viash run src/task/control_methods/sample/config.vsh.yaml -- \
viash run src/task/control_methods/mean_across_compounds/config.vsh.yaml -- \
--de_train_h5ad "$OUT/de_train.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--id_map "$OUT/id_map.csv" \
Expand Down
25 changes: 12 additions & 13 deletions scripts/run_benchmark_test.sh
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
#!/bin/bash

set -e

IN="resources"
OUT="output"

[[ ! -d "$OUT" ]] && mkdir -p "$OUT"

# run benchmark
# 'input_states' looks for state.yaml files corresponding to datasets
export NXF_VER=23.04.2

cat > /tmp/params.yaml << EOF
id: neurips-2023-data
de_train_h5ad: resources/neurips-2023-data/de_train.h5ad
de_test_h5ad: resources/neurips-2023-data/de_test.h5ad
id_map: resources/neurips-2023-data/id_map.csv
method_ids: ['ground_truth', 'sample', 'mean_across_celltypes', 'mean_across_compounds']
layer: t # test a different layer
publish_dir: "output/test_run_benchmark"
output_state: state.yaml
EOF

nextflow run . \
-main-script target/nextflow/workflows/run_benchmark/main.nf \
-profile docker \
-resume \
--publish_dir "$OUT" \
--output_state "state.yaml" \
-entry auto \
--input_states "$IN/**/state.yaml"
-params-file /tmp/params.yaml
15 changes: 12 additions & 3 deletions scripts/run_benchmark_tw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@ RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/**/state.yaml
param_list:
- id: neurips-2023-data
de_train_h5ad: resources/neurips-2023-data/de_train.h5ad
de_test_h5ad: resources/neurips-2023-data/de_test.h5ad
id_map: resources/neurips-2023-data/id_map.csv
layer: clipped_sign_log10_pval
- id: neurips-2023-kaggle
de_train_h5ad: resources/neurips-2023-kaggle/de_train.h5ad
de_test_h5ad: resources/neurips-2023-kaggle/de_test.h5ad
id_map: resources/neurips-2023-kaggle/id_map.csv
layer: sign_log10_pval
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
--revision main_build \
--revision remove_clipped_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
Expand Down
23 changes: 0 additions & 23 deletions scripts/run_layerclip_tw.sh

This file was deleted.

18 changes: 18 additions & 0 deletions scripts/run_stability_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Local test run of the stability analysis workflow.
#
# Writes a small parameter file and launches the built Nextflow workflow
# (target/nextflow/workflows/run_stability_analysis/main.nf) with the docker
# profile. Run from the repository root, since all paths are relative to it.

# Abort on the first failing command, on unset variables and on pipeline
# failures, so Nextflow is never started with a missing or partial params file.
set -euo pipefail

export NXF_VER=23.04.2

# Use a private temporary directory instead of the shared /tmp/params.yaml:
# other scripts in this directory write to that same fixed path, so parallel
# or interleaved runs would overwrite each other's parameters. The file keeps
# its .yaml extension so it is still recognised as YAML by Nextflow.
params_dir=$(mktemp -d)
params_file="$params_dir/params.yaml"
trap 'rm -rf -- "$params_dir"' EXIT

# Quoted delimiter: the YAML below is written literally, without expansion.
cat > "$params_file" <<'HERE'
id: neurips-2023-data
sc_counts: resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad
method_ids: ['ground_truth', 'sample', 'mean_across_celltypes', 'mean_across_compounds']
layer: t # test a different layer
publish_dir: "output/test_stability_analysis"
output_state: "state.yaml"
HERE

nextflow run . \
  -main-script target/nextflow/workflows/run_stability_analysis/main.nf \
  -profile docker \
  -resume \
  -params-file "$params_file"
3 changes: 1 addition & 2 deletions scripts/run_stability_tw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/result
cat > /tmp/params.yaml << HERE
id: neurips-2023-data
sc_counts: s3://openproblems-bio/public/neurips-2023-competition/sc_counts_reannotated_with_counts.h5ad
layer: clipped_sign_log10_pval
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
--revision main_build \
--revision remove_clipped_build \
--pull-latest \
--main-script target/nextflow/workflows/run_stability_analysis/main.nf \
--workspace 53907369739130 \
Expand Down
2 changes: 1 addition & 1 deletion src/task/api/comp_control_method.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ functionality:
- name: --layer
type: string
direction: input
default: sign_log10_pval
default: clipped_sign_log10_pval
description: Which layer to use for prediction.
- name: --output
__merge__: file_prediction.yaml
Expand Down
2 changes: 1 addition & 1 deletion src/task/api/comp_method_notest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ functionality:
- name: --layer
type: string
direction: input
default: sign_log10_pval
default: clipped_sign_log10_pval
description: Which layer to use for prediction.
- name: --output
__merge__: file_prediction.yaml
Expand Down
2 changes: 1 addition & 1 deletion src/task/api/comp_metric.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ functionality:
- name: --de_test_layer
type: string
direction: input
default: sign_log10_pval
default: clipped_sign_log10_pval
description: In which layer to find the DE data.
- name: --prediction
__merge__: file_prediction.yaml
Expand Down
8 changes: 7 additions & 1 deletion src/task/api/file_de_test_h5ad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,17 @@ info:
- name: sign_log10_pval
type: double
description: |
Differential expression value (-log10(p-value) * sign(LFC)) for each gene.
Differential expression value (`-log10(p-value) * sign(LFC)`) for each gene.
Here, LFC is the estimated log-fold change in expression between the treatment
and control condition after shrinkage as calculated by Limma. Positive LFC means
the gene goes up in the treatment condition relative to the control.
required: true
- name: clipped_sign_log10_pval
type: double
description: |
A clipped version of the sign_log10_pval layer. Values are clipped to be between
-4 and 4 (i.e. `log10(0.0001)` and `-log10(0.0001)`).
required: true
uns:
- type: string
name: dataset_id
Expand Down
8 changes: 7 additions & 1 deletion src/task/api/file_de_train_h5ad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,17 @@ info:
- name: sign_log10_pval
type: double
description: |
Differential expression value (-log10(p-value) * sign(LFC)) for each gene.
Differential expression value (`-log10(p-value) * sign(LFC)`) for each gene.
Here, LFC is the estimated log-fold change in expression between the treatment
and control condition after shrinkage as calculated by Limma. Positive LFC means
the gene goes up in the treatment condition relative to the control.
required: true
- name: clipped_sign_log10_pval
type: double
description: |
A clipped version of the sign_log10_pval layer. Values are clipped to be between
-4 and 4 (i.e. `log10(0.0001)` and `-log10(0.0001)`).
required: true
uns:
- type: string
name: dataset_id
Expand Down
2 changes: 1 addition & 1 deletion src/task/control_methods/ground_truth/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ library(dplyr, warn.conflicts = FALSE)
par <- list(
de_train_h5ad = "resources/neurips-2023-data/de_train.h5ad",
de_test_h5ad = "resources/neurips-2023-data/de_test.h5ad",
layer = "sign_log10_pval",
layer = "clipped_sign_log10_pval",
id_map = "resources/neurips-2023-data/id_map.csv",
output = "resources/neurips-2023-data/output_identity.h5ad"
)
Expand Down
2 changes: 1 addition & 1 deletion src/task/control_methods/mean_across_celltypes/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"de_test_h5ad": "resources/neurips-2023-data/de_test.h5ad",
"layer": "sign_log10_pval",
"layer": "clipped_sign_log10_pval",
"id_map": "resources/neurips-2023-data/id_map.csv",
"output": "resources/neurips-2023-data/output_mean.h5ad",
}
Expand Down
2 changes: 1 addition & 1 deletion src/task/control_methods/mean_across_compounds/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"de_test_h5ad": "resources/neurips-2023-data/de_test.h5ad",
"layer": "sign_log10_pval",
"layer": "clipped_sign_log10_pval",
"id_map": "resources/neurips-2023-data/id_map.csv",
"output": "resources/neurips-2023-data/output_mean.h5ad",
}
Expand Down
2 changes: 1 addition & 1 deletion src/task/control_methods/mean_outcome/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"de_test_h5ad": "resources/neurips-2023-data/de_test.h5ad",
"layer": "sign_log10_pval",
"layer": "clipped_sign_log10_pval",
"id_map": "resources/neurips-2023-data/id_map.csv",
"output": "resources/neurips-2023-data/output_mean.h5ad",
}
Expand Down
2 changes: 1 addition & 1 deletion src/task/control_methods/sample/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ requireNamespace("anndata", quietly = TRUE)
par <- list(
de_train_h5ad = "resources/neurips-2023-data/de_train.h5ad",
de_test_h5ad = "resources/neurips-2023-data/de_test.h5ad",
layer = "sign_log10_pval",
layer = "clipped_sign_log10_pval",
id_map = "resources/neurips-2023-data/id_map.csv",
output = "resources/neurips-2023-data/output_identity.h5ad"
)
Expand Down
2 changes: 1 addition & 1 deletion src/task/control_methods/zeros/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"de_test_h5ad": "resources/neurips-2023-data/de_test.h5ad",
"layer": "sign_log10_pval",
"layer": "clipped_sign_log10_pval",
"id_map": "resources/neurips-2023-data/id_map.csv",
"output": "resources/neurips-2023-data/output_mean.h5ad",
}
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ nextflow run . \
-resume \
--de_train_h5ad resources/neurips-2023-data/de_train.h5ad \
--id_map resources/neurips-2023-data/id_map.csv \
--layer sign_log10_pval \
--layer clipped_sign_log10_pval \
--epochs 2 \
--kf_n_splits 2 \
--schemes "initial;light" \
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble_prepare/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ functionality:
- name: --layer
type: string
direction: input
default: sign_log10_pval
default: clipped_sign_log10_pval
description: Which layer to use for prediction.
- name: --train_data_aug_dir
type: file
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble_prepare/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"id_map": "resources/neurips-2023-data/id_map.csv",
"layer": "sign_log10_pval",
"layer": "clipped_sign_log10_pval",
"epochs": 10,
"kf_n_splits": 3,
"models": ["initial", "light", "heavy"],
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/nn_retraining_with_pseudolabels/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"id_map": "resources/neurips-2023-data/id_map.csv",
"layer": "sign_log10_pval",
"layer": "clipped_sign_log10_pval",
"output": "output.h5ad",
"reps": 2,
}
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/pyboost/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
## VIASH START
par = dict(
de_train_h5ad = "resources/neurips-2023-data/de_train.h5ad",
layer = "sign_log10_pval",
layer = "clipped_sign_log10_pval",
id_map = "resources/neurips-2023-data/id_map.csv",
predictor_names = ["py_boost"],
output = "output.h5ad",
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/scape/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
id_map = "resources/neurips-2023-data/id_map.csv",
output = "output.h5ad",
output_model = None,
layer = "sign_log10_pval",
layer = "clipped_sign_log10_pval",
# cell = "NK cells",
cell = "lol",
epochs = 2,
Expand Down
2 changes: 1 addition & 1 deletion src/task/methods/transformer_ensemble/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"id_map": "resources/neurips-2023-data/id_map.csv",
"output": "output.h5ad",
"num_train_epochs": 10,
"layer": "sign_log10_pval"
"layer": "clipped_sign_log10_pval"
}
meta = {
"resources_dir": "src/task/methods/transformer_ensemble",
Expand Down
46 changes: 0 additions & 46 deletions src/task/metrics/mean_correlation/config.vsh.yaml

This file was deleted.

Loading

0 comments on commit 753f37c

Please sign in to comment.