Update_scripts (openproblems-bio#52)
* update

* fix labels

* add back epoch logging

* add message

* revert multigpu changes in LGC_ensemble

* simplify tower config

* fix scripts

* use mean_across_compounds as test data

* implement alternative implementations of the metrics in R

* fix descriptions, add alternative metrics in wf

* apply patch to fix lgc_ensemble on bootstrapped data

Co-authored-by: ttunja <60556758+ttunja@users.noreply.github.com>

* fix component names

* fix metric ids

* fill std nas in transformer_ensemble

* fix resource

* fix method id

---------

Co-authored-by: ttunja <60556758+ttunja@users.noreply.github.com>
rcannood and ttunja authored May 30, 2024
1 parent 1f6afe2 commit 22fc360
Showing 29 changed files with 454 additions and 110 deletions.
8 changes: 1 addition & 7 deletions scripts/add_a_method.sh
@@ -26,12 +26,6 @@ viash test src/task/methods/$method_id/config.vsh.yaml
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  ---setup cachedbuild ---verbose

- # run the method (using parquet as input)
- viash run src/task/methods/$method_id/config.vsh.yaml -- \
-   --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
-   --id_map "resources/neurips-2023-kaggle/id_map.csv" \
-   --output "output/prediction.h5ad"
-
# run the method (using h5ad as input)
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  --de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \
@@ -40,7 +34,7 @@ viash run src/task/methods/$method_id/config.vsh.yaml -- \

# run evaluation metric
viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \
-   --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
+   --de_test_h5ad "resources/neurips-2023-kaggle/de_test.h5ad" \
  --prediction "output/prediction.h5ad" \
  --output "output/score.h5ad"
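For orientation, mean_rowwise_error scores a prediction against the held-out DE matrix row by row. A minimal sketch of that computation, assuming dense matrices; "sign_log10_pval" is a hypothetical layer name for the test values, while "prediction" matches the layer written by the predict component further down this diff:

```python
import anndata as ad
import numpy as np

# Assumed inputs; layer names for the test data are illustrative.
de_test = ad.read_h5ad("resources/neurips-2023-kaggle/de_test.h5ad")
prediction = ad.read_h5ad("output/prediction.h5ad")

y_true = np.asarray(de_test.layers["sign_log10_pval"])  # hypothetical layer name
y_pred = np.asarray(prediction.layers["prediction"])

# RMSE per row (perturbation), then the mean across rows.
rmse_per_row = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=1))
print("mean_rowwise_rmse:", float(np.mean(rmse_per_row)))
```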
2 changes: 1 addition & 1 deletion scripts/generate_resources.sh
@@ -32,7 +32,7 @@ nextflow run \
  --publish_dir "$OUT"

echo ">> Run method"
- viash run src/task/control_methods/sample/config.vsh.yaml -- \
+ viash run src/task/control_methods/mean_across_compounds/config.vsh.yaml -- \
  --de_train_h5ad "$OUT/de_train.h5ad" \
  --de_test_h5ad "$OUT/de_test.h5ad" \
  --id_map "$OUT/id_map.csv" \
21 changes: 0 additions & 21 deletions scripts/run_benchmark.sh

This file was deleted.

23 changes: 0 additions & 23 deletions scripts/run_stability_test.sh

This file was deleted.

@@ -5,15 +5,15 @@ publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/result

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
- input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/**/state.yaml
+ input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
output_state: "state.yaml"
publish_dir: "$publish_dir"
- rename_keys: "de_train:de_train,de_train_h5ad:de_train_h5ad,de_test:de_test,de_test_h5ad:de_test_h5ad,id_map:id_map"
+ rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
settings: '{"stability": true, "stability_obs_fraction": 0.99, "stability_var_fraction": 0.99}'
HERE

tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
- --revision add_bootstrapping_build \
+ --revision main_build \
  --pull-latest \
  --main-script target/nextflow/workflows/run_benchmark/main.nf \
  --workspace 53907369739130 \
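The stability settings rerun the benchmark on slightly perturbed inputs. A rough sketch of what subsampling 99% of observations and variables could look like — this is a guess at the mechanism for illustration, not the workflow's actual code:

```python
import anndata as ad
import numpy as np

rng = np.random.default_rng(0)

def subsample(adata: ad.AnnData, obs_fraction: float = 0.99, var_fraction: float = 0.99) -> ad.AnnData:
    """Keep a random fraction of rows (perturbations) and columns (genes)."""
    obs_idx = rng.choice(adata.n_obs, size=int(adata.n_obs * obs_fraction), replace=False)
    var_idx = rng.choice(adata.n_vars, size=int(adata.n_vars * var_fraction), replace=False)
    return adata[obs_idx, var_idx].copy()

# e.g.: de_train_sub = subsample(ad.read_h5ad("de_train.h5ad"))
```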
23 changes: 23 additions & 0 deletions scripts/run_tval_tw.sh
@@ -0,0 +1,23 @@
#!/bin/bash

RUN_ID="tval_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
output_state: "state.yaml"
publish_dir: "$publish_dir"
rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
settings: '{"layer": "t"}'
HERE

tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
--revision add_cell_obs_to_uns_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--entry-name auto \
--config src/common/nextflow_helpers/labels_tw.config
5 changes: 5 additions & 0 deletions scripts/sync_results.sh
@@ -4,3 +4,8 @@ aws s3 sync \
  s3://openproblems-data/resources/dge_perturbation_prediction/results/ \
  output/benchmark_results/ \
  --delete --dryrun

+ aws s3 sync \
+   output/benchmark_results/ \
+   s3://openproblems-data/resources/dge_perturbation_prediction/results/ \
+   --delete --dryrun
15 changes: 5 additions & 10 deletions src/common/nextflow_helpers/labels_tw.config
@@ -30,33 +30,28 @@ process {
    disk = { 400.GB * task.attempt }
  }
  withLabel: lowsharedmem {
-   containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""}
+   containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" }
  }
  withLabel: midsharedmem {
-   containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""}
+   containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" }
  }
  withLabel: highsharedmem {
-   containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""}
+   containerOptions = { "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" }
  }
  withLabel: gpu {
-   cpus = 16
+   memory = 100.GB
+   cpus = 20
    accelerator = 1
-   containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-     ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: midgpu { // aiming for g4dn.12xlarge
    memory = 150.GB
    cpus = 40
    accelerator = 4
-   containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-     ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: highgpu { // aiming for g4dn.metal
    memory = 300.GB
    cpus = 80
    accelerator = 8
-   containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-     ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  // make sure publishstates gets enough disk space and memory
  withName:'.*publishStatesProc' {
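As a worked example of the shared-memory arithmetic (assuming task.memory.mega yields the task's memory in megabytes), a quick sketch of the values the three labels produce for a 100 GB task:

```python
def shm_size(memory_mb: float, fraction: float) -> str:
    # Mirrors String.format("%.0f", task.memory.mega * fraction) in the config above
    return f"--shm-size {memory_mb * fraction:.0f}"

for label, fraction in [("lowsharedmem", 0.05), ("midsharedmem", 0.1), ("highsharedmem", 0.25)]:
    print(label, shm_size(100 * 1024, fraction))
# lowsharedmem --shm-size 5120, midsharedmem --shm-size 10240, highsharedmem --shm-size 25600
```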
19 changes: 6 additions & 13 deletions src/task/methods/lgc_ensemble_helpers/helper_functions.py
@@ -210,8 +210,8 @@ def train_function(model, model_name, x_train, y_train, x_val, y_val, info_data,
        if val_mrrmse < best_loss:
            best_loss = val_mrrmse
            best_weights = model.state_dict()
-           # print('BEST ----> ')
-           # print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}")
+           print('BEST ----> ')
+           print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}")
    t1 = time.time()
    results['runtime'] = float(t1-t0)
    model.load_state_dict(best_weights)
@@ -230,19 +230,12 @@ def cross_validate_models(X, y, kf_cv, cell_types_sm_names, paths, config=None,
                     'val_sm_name': cell_types_sm_names.iloc[val_idx]['sm_name'].tolist()}
        for Model in [LSTM, Conv, GRU]:
            model = Model(scheme, X.shape, y.shape)
-
-           if torch.cuda.device_count() > 1:
-               model = nn.DataParallel(model)
-               model_name = model.module.name
-           else:
-               model_name = model.name
-
-           model, results = train_function(model, model_name, x_train, y_train, x_val, y_val, info_data, config=config, clip_norm=clip_norm)
+           model, results = train_function(model, model.name, x_train, y_train, x_val, y_val, info_data, config=config, clip_norm=clip_norm)
            model.to('cpu')
            trained_models.append(model)
-           print(f'PATH OF THE MODEL EQUALS: {paths["model_dir"]}/pytorch_{model_name}_{scheme}_fold{i}.pt')
-           torch.save(model.state_dict(), f'{paths["model_dir"]}/pytorch_{model_name}_{scheme}_fold{i}.pt')
-           with open(f'{paths["logs_dir"]}/{model_name}_{scheme}_fold{i}.json', 'w') as file:
+           print(f'PATH OF THE MODEL EQUALS: {paths["model_dir"]}/pytorch_{model.name}_{scheme}_fold{i}.pt')
+           torch.save(model.state_dict(), f'{paths["model_dir"]}/pytorch_{model.name}_{scheme}_fold{i}.pt')
+           with open(f'{paths["logs_dir"]}/{model.name}_{scheme}_fold{i}.json', 'w') as file:
                json.dump(results, file)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
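The removed branch existed because nn.DataParallel wraps the model, so custom attributes such as name move behind .module; with the revert to single-GPU training, model.name is always valid. A minimal sketch of the wrapping behaviour (the Net class is a hypothetical stand-in for the LSTM/Conv/GRU models):

```python
import torch
import torch.nn as nn

class Net(nn.Module):  # hypothetical stand-in for the LSTM/Conv/GRU models
    def __init__(self):
        super().__init__()
        self.name = "Net"
        self.fc = nn.Linear(4, 2)

model = Net()
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    model_name = model.module.name  # the wrapper itself has no .name attribute
else:
    model_name = model.name
print(model_name)
```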
3 changes: 2 additions & 1 deletion src/task/methods/lgc_ensemble_helpers/prepare_data.py
@@ -19,6 +19,7 @@ def prepare_data(par, paths):
    mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
    std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
    std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()
+   std_sm_name_filled = std_sm_name.fillna(0)
    cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line
    quantiles_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\
        .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1)
@@ -30,7 +31,7 @@
    mean_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv', index=False)
    std_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/std_cell_type.csv', index=False)
    mean_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/mean_sm_name.csv', index=False)
-   std_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False)
+   std_sm_name_filled.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False)
    quantiles_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False)
    ## Create one hot encoding features
    one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"])
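The fillna(0) lines trace back to pandas behaviour: groupby().std() uses the sample standard deviation, which is NaN for any group with a single member — exactly what happens when a compound survives only once in bootstrapped training data. A small demonstration with toy data:

```python
import pandas as pd

df = pd.DataFrame({
    "sm_name": ["drug_a", "drug_a", "drug_b"],  # drug_b appears only once
    "gene_1": [0.5, 0.7, 0.3],
})

std = df.groupby("sm_name").std().reset_index()
print(std)            # drug_b's std is NaN (sample std needs >= 2 observations)
print(std.fillna(0))  # what gets written to std_sm_name.csv after the fix
```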
3 changes: 2 additions & 1 deletion src/task/methods/lgc_ensemble_predict/script.py
@@ -145,13 +145,14 @@
df_sub.reset_index(drop=True, inplace=True)

# write output
+ method_id = meta["functionality_name"].replace("_predict", "")
output = ad.AnnData(
    layers={"prediction": df_sub.to_numpy()},
    obs=pd.DataFrame(index=id_map["id"]),
    var=pd.DataFrame(index=gene_names),
    uns={
        "dataset_id": train_config["DATASET_ID"],
-       "method_id": meta["functionality_name"]
+       "method_id": method_id
    }
)
print(output)
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble_prepare/config.vsh.yaml
@@ -79,4 +79,4 @@ platforms:
  - type: native
  - type: nextflow
    directives:
-     label: [hightime, veryhighmem, highcpu, highsharedmem, gpu]
+     label: [hightime, veryhighmem, highcpu]
3 changes: 2 additions & 1 deletion src/task/methods/lgc_ensemble_prepare/script.py
@@ -60,6 +60,7 @@
mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()
+ std_sm_name_filled = std_sm_name.fillna(0)
cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line
quantiles_cell_type = pd.concat(
    [pd.DataFrame(cell_types)] +
@@ -74,7 +75,7 @@
mean_cell_type.to_csv(f'{par["train_data_aug_dir"]}/mean_cell_type.csv', index=False)
std_cell_type.to_csv(f'{par["train_data_aug_dir"]}/std_cell_type.csv', index=False)
mean_sm_name.to_csv(f'{par["train_data_aug_dir"]}/mean_sm_name.csv', index=False)
- std_sm_name.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False)
+ std_sm_name_filled.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False)
quantiles_cell_type.to_csv(f'{par["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False)
with open(f'{par["train_data_aug_dir"]}/gene_names.json', 'w') as f:
    json.dump(gene_names, f)
2 changes: 1 addition & 1 deletion src/task/methods/lgc_ensemble_train/config.vsh.yaml
@@ -69,4 +69,4 @@ platforms:
  - type: native
  - type: nextflow
    directives:
-     label: [hightime, veryhighmem, highcpu]
+     label: [hightime, veryhighmem, highcpu, highsharedmem, gpu]
4 changes: 4 additions & 0 deletions src/task/methods/lgc_ensemble_train/script.py
@@ -3,6 +3,10 @@
import json
import numpy as np
import pandas as pd
+ if torch.cuda.is_available():
+     print("using device: cuda", flush=True)
+ else:
+     print('using device: cpu', flush=True)

## VIASH START
par = {
4 changes: 2 additions & 2 deletions src/task/methods/transformer_ensemble/utils.py
@@ -47,10 +47,10 @@ def prepare_augmented_data(
    de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]

    mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
-   std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
+   std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0)

    mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
-   std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()
+   std_sm_name = de_sm_name.groupby('sm_name').std().reset_index().fillna(0)

    # Append mean and std for 'cell_type'
    rows = []
19 changes: 14 additions & 5 deletions src/task/metrics/mean_correlation/config.vsh.yaml
@@ -7,7 +7,13 @@ functionality:
        label: Mean Pearson
        summary: The mean of Pearson correlations per row (perturbation).
        description: |
-         We use the **Mean Pearson Correlation** to score submissions.
+         The **Mean Pearson Correlation** is computed as follows:
+
+         $$
+         \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{y}}_i)}}
+         $$
+
+         where $R$ is the number of scored rows, and $\mathbf{y}_i$ and $\mathbf{\hat{y}}_i$ are the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
@@ -17,7 +23,13 @@ functionality:
        label: Mean Spearman
        summary: The mean of Spearman correlations per row (perturbation).
        description: |
-         We use the **Mean Spearman Correlation** to score submissions.
+         The **Mean Spearman Correlation** is computed as follows:
+
+         $$
+         \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{r}}_i)}}
+         $$
+
+         where $R$ is the number of scored rows, and $\mathbf{r}_i$ and $\mathbf{\hat{r}}_i$ are the ranks of the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
@@ -29,9 +41,6 @@ functionality:
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
-   setup:
-     - type: python
-       packages: [ fastparquet ]
  - type: nextflow
    directives:
      label: [ midtime, highmem, highcpu ]
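A minimal sketch of both metrics in Python (using scipy for illustration; the component's actual script may differ):

```python
import numpy as np
from scipy.stats import pearsonr, spearmanr

def mean_rowwise_correlations(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Correlate each row (perturbation) separately, then average."""
    pearsons = [pearsonr(t, p)[0] for t, p in zip(y_true, y_pred)]
    spearmans = [spearmanr(t, p)[0] for t, p in zip(y_true, y_pred)]
    return {"mean_pearson": float(np.mean(pearsons)),
            "mean_spearman": float(np.mean(spearmans))}

rng = np.random.default_rng(1)
truth = rng.normal(size=(4, 10))                        # 4 perturbations x 10 genes
pred = truth + rng.normal(scale=0.5, size=truth.shape)  # noisy "prediction"
print(mean_rowwise_correlations(truth, pred))
```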
46 changes: 46 additions & 0 deletions src/task/metrics/mean_correlation_r/config.vsh.yaml
@@ -0,0 +1,46 @@
__merge__: ../../api/comp_metric.yaml
functionality:
  name: mean_correlation_r
  info:
    metrics:
      - name: mean_pearson_r
        label: Mean Pearson bis
        summary: The mean of Pearson correlations per row (perturbation).
        description: |
          The **Mean Pearson Correlation** is computed as follows:

          $$
          \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{y}}_i)}}
          $$

          where $R$ is the number of scored rows, and $\mathbf{y}_i$ and $\mathbf{\hat{y}}_i$ are the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
        max: 1
        maximize: true
      - name: mean_spearman_r
        label: Mean Spearman bis
        summary: The mean of Spearman correlations per row (perturbation).
        description: |
          The **Mean Spearman Correlation** is computed as follows:

          $$
          \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i)} \cdot \sqrt{\textrm{Var}(\mathbf{\hat{r}}_i)}}
          $$

          where $R$ is the number of scored rows, and $\mathbf{r}_i$ and $\mathbf{\hat{r}}_i$ are the ranks of the actual and predicted values, respectively, for row $i$.
        repository_url: null
        documentation_url: null
        min: -1
        max: 1
        maximize: true
  resources:
    - type: r_script
      path: script.R
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_r:1.0.4
  - type: nextflow
    directives:
      label: [ midtime, highmem, highcpu ]