Skip to content

Commit

Permalink
fix the bootstrapping component to align with what one expects from a…
Browse files Browse the repository at this point in the history
… bootstrapping method (openproblems-bio#61)
  • Loading branch information
rcannood authored Jun 2, 2024
1 parent 1dca6d5 commit 02d3dff
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 179 deletions.
65 changes: 30 additions & 35 deletions src/task/process_dataset/bootstrap/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,56 +11,51 @@ functionality:
argument_groups:
- name: Inputs
arguments:
- name: --train_h5ad
- name: --input
type: file
required: true
direction: input
example: resources/neurips-2023-kaggle/de_train.h5ad
- name: --test_h5ad
type: file
required: true
direction: input
example: resources/neurips-2023-kaggle/de_test.h5ad
example: resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad
- name: Outputs
arguments:
- name: --output_train_h5ad
- name: --output
type: file
required: true
direction: output
multiple: true
example: [de_train_bootstrap.h5ad]
- name: --output_test_h5ad
type: file
required: true
direction: output
multiple: true
example: [de_test_bootstrap.h5ad]
- name: Arguments
example: sc_counts_bootstrap.h5ad
- name: Sampling parameters
description: Parameters for sampling the bootstraps.
arguments:
- name: --num_replicates
type: integer
required: true
default: 10
description: Number of bootstraps to generate.
- name: --bootstrap_obs
type: boolean
default: true
description: Whether to sample observations.
- name: --obs_fraction
type: double
required: true
default: 0.95
description: Fraction of the training dataset obs to include in each bootstrap.
default: 1
description: Fraction of the obs of the sc_counts to include in each bootstrap.
- name: --obs_replace
type: boolean
default: true
description: Whether to sample with replacement.
- name: --bootstrap_var
type: boolean
default: false
description: Whether to sample variables.
- name: --var_fraction
type: double
required: true
default: 0.95
description: Fraction of the training & test dataset var to include in each bootstrap.
default: 1
description: Fraction of the var of the sc_counts to include in each bootstrap.
- name: --var_replace
type: boolean
default: true
description: Whether to sample with replacement.
resources:
- type: r_script
path: script.R
- type: python_script
path: script.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_r:1.0.4
setup:
- type: r
cran: [ dplyr, tidyr, purrr, tibble, arrow ]
image: ghcr.io/openproblems-bio/base_python:1.0.4
- type: nextflow
directives:
label: [ midtime, midmem, lowcpu ]
label: [ midtime, highmem, midcpu ]
55 changes: 0 additions & 55 deletions src/task/process_dataset/bootstrap/script.R

This file was deleted.

39 changes: 39 additions & 0 deletions src/task/process_dataset/bootstrap/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import anndata as ad
import numpy as np

# VIASH START
par = {
    "input": "resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad",
    "output": "output/sc_counts_bootstrapped_*.h5ad",
    "bootstrap_obs": True,
    "obs_fraction": 1,
    "obs_replace": True,
    "bootstrap_var": False,
    "var_fraction": 1,
    "var_replace": True
}
# VIASH END


def _draw_indices(axis_size, fraction, with_replacement):
    """Draw random positions along one axis of the dataset.

    Args:
        axis_size: Number of entries along the axis being sampled.
        fraction: Multiplier applied to ``axis_size``; the product is
            truncated with ``int()`` to obtain the number of draws.
        with_replacement: Passed to ``np.random.choice`` as ``replace``;
            when true the same position can be drawn more than once.

    Returns:
        Array of sampled integer positions in ``[0, axis_size)``.
    """
    sample_size = int(axis_size * fraction)
    return np.random.choice(
        np.arange(axis_size),
        sample_size,
        replace=with_replacement
    )


# Read the dataset that will be resampled.
adata = ad.read_h5ad(par["input"])

# Resample along the observation axis when requested.
if par["bootstrap_obs"]:
    picked_obs = _draw_indices(
        adata.n_obs, par["obs_fraction"], par["obs_replace"]
    )
    adata = adata[picked_obs, :]

# Resample along the variable axis when requested. This runs after the
# observation step so the random draws happen in a fixed order.
if par["bootstrap_var"]:
    picked_var = _draw_indices(
        adata.n_vars, par["var_fraction"], par["var_replace"]
    )
    adata = adata[:, picked_var]

# Store the (possibly subsetted) dataset as a gzip-compressed h5ad file.
adata.write_h5ad(par["output"], compression="gzip")
46 changes: 0 additions & 46 deletions src/task/process_dataset/bootstrap_sc_counts/config.vsh.yaml

This file was deleted.

32 changes: 0 additions & 32 deletions src/task/process_dataset/bootstrap_sc_counts/script.py

This file was deleted.

27 changes: 18 additions & 9 deletions src/task/workflows/run_stability_analysis/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,30 @@ functionality:
- name: Bootstrapping arguments
description: Define the sampling strategy for the stability analysis.
arguments:
- name: --bootstrap_num_replicates
type: integer
required: true
default: 10
description: Number of bootstraps to generate.
- name: --bootstrap_obs
type: boolean
default: true
description: Whether to sample observations.
- name: --bootstrap_obs_fraction
type: double
required: true
default: 0.95
default: 1
description: Fraction of the obs of the sc_counts to include in each bootstrap.
- name: --bootstrap_obs_replace
type: boolean
default: true
description: Whether to sample with replacement.
- name: --bootstrap_var
type: boolean
default: false
description: Whether to sample variables.
- name: --bootstrap_var_fraction
type: double
required: true
default: 1
description: Fraction of the var of the sc_counts to include in each bootstrap.
- name: --bootstrap_var_replace
type: boolean
default: true
description: Whether to sample with replacement.
- name: Outputs
arguments:
- name: "--scores"
Expand All @@ -60,7 +69,7 @@ functionality:
- type: file
path: "../../api/task_info.yaml"
dependencies:
- name: process_dataset/bootstrap_sc_counts
- name: process_dataset/bootstrap
- name: workflows/process_dataset
- name: workflows/run_benchmark
repositories:
Expand Down
8 changes: 6 additions & 2 deletions src/task/workflows/run_stability_analysis/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@ workflow run_wf {
}
}

| bootstrap_sc_counts.run(
| bootstrap.run(
fromState: [
input: "sc_counts",
bootstrap_obs: "bootstrap_obs",
obs_fraction: "bootstrap_obs_fraction",
var_fraction: "bootstrap_var_fraction"
obs_replace: "bootstrap_obs_replace",
bootstrap_var: "bootstrap_var",
var_fraction: "bootstrap_var_fraction",
var_replace: "bootstrap_var_replace"
],
toState: [
sc_counts: "output"
Expand Down

0 comments on commit 02d3dff

Please sign in to comment.