add cell obs to uns (openproblems-bio#51)

* update jn_ap_op2 * add cell obs to uns * fix script * refactor transformer_ensemble and convert_h5ad_to_parquet * update scape * update pyboost * fix methods * fix control methods * fix test * fix baselines * fix api and wfs * fix methods and metrics * fix components * add layer argument to metrics as well * add layer argument to benchmark wf * remove outdated input * add t statistic to output table * fixes * demote lstm gpu usage * fix code * fix previous commit * fix workflow * move workflow * switch to midgpu * simplify description * fix api * fix wf * add more helpful messages * fix prediction extension * remove references to parquet * allow specifying different test and prediction layers * add gene resolution strategy * try to parallelize with nn.DataParallel * fix args * create nextflow workflow for lgc ensemble * fix model names * fix config * enable lazy loading for the models * revert `load_trained_models` function
Paulos2411 · May 29, 2024 · 1f6afe2 · 1f6afe2
1 parent 5934a85
commit 1f6afe2
Show file tree

Hide file tree

Showing 73 changed files with 1,999 additions and 1,166 deletions.
diff --git a/scripts/add_a_method.sh b/scripts/add_a_method.sh
@@ -30,18 +30,18 @@ viash run src/task/methods/$method_id/config.vsh.yaml -- \
 viash run src/task/methods/$method_id/config.vsh.yaml -- \
   --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
   --id_map "resources/neurips-2023-kaggle/id_map.csv" \
-  --output "output/prediction.parquet"
+  --output "output/prediction.h5ad"
 
 # run the method (using h5ad as input)
 viash run src/task/methods/$method_id/config.vsh.yaml -- \
   --de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \
   --id_map "resources/neurips-2023-kaggle/id_map.csv" \
-  --output "output/prediction.parquet"
+  --output "output/prediction.h5ad"
 
 # run evaluation metric
 viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
-  --prediction "output/prediction.parquet" \
+  --prediction "output/prediction.h5ad" \
   --output "output/score.h5ad"
 
 # print score on kaggle test dataset

diff --git a/scripts/generate_kaggle_resources.sh b/scripts/generate_kaggle_resources.sh
@@ -18,9 +18,8 @@ python -c \
 viash run src/task/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml -- \
   --input_train "$OUT/2023-09-12_de_by_cell_type_train.h5ad" \
   --input_test "$OUT/2023-09-12_de_by_cell_type_test.h5ad" \
-  --output_train "$OUT/de_train.parquet" \
+  --input_single_cell_h5ad "resources/neurips-2023-raw/sc_counts.h5ad" \
   --output_train_h5ad "$OUT/de_train.h5ad" \
-  --output_test "$OUT/de_test.parquet" \
   --output_test_h5ad "$OUT/de_test.h5ad" \
   --output_id_map "$OUT/id_map.csv" \
   --dataset_id neurips-2023-kaggle \
@@ -33,22 +32,19 @@ viash run src/task/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yam
 
 echo ">> Run method"
 viash run src/task/control_methods/sample/config.vsh.yaml -- \
-  --de_train "$OUT/de_train.parquet" \
-  --de_test "$OUT/de_test.parquet" \
+  --de_train_h5ad "$OUT/de_train.h5ad" \
+  --de_test_h5ad "$OUT/de_test.h5ad" \
   --id_map "$OUT/id_map.csv" \
-  --output "$OUT/prediction.parquet"
+  --output "$OUT/prediction.h5ad"
 
 echo ">> Run metric"
 viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \
-  --prediction "$OUT/prediction.parquet" \
-  --method_id "sample" \
+  --prediction "$OUT/prediction.h5ad" \
   --de_test_h5ad "$OUT/de_test.h5ad" \
   --output "$OUT/score.h5ad"
 
 cat > "$OUT/state.yaml" <<'EOF'
 id: neurips-2023-kaggle
-de_train: !file de_train.parquet
-de_test: !file de_test.parquet
 de_train_h5ad: !file de_train.h5ad
 de_test_h5ad: !file de_test.h5ad
 id_map: !file id_map.csv

diff --git a/scripts/generate_resources.sh b/scripts/generate_resources.sh
@@ -16,7 +16,7 @@ fi
 
 echo ">> Running 'process_dataset' workflow"
 nextflow run \
-  target/nextflow/process_dataset/workflow/main.nf \
+  target/nextflow/workflows/process_dataset/main.nf \
   -profile docker \
   -resume \
   --id neurips-2023-data \
@@ -33,15 +33,14 @@ nextflow run \
 
 echo ">> Run method"
 viash run src/task/control_methods/sample/config.vsh.yaml -- \
-  --de_train "$OUT/de_train.parquet" \
-  --de_test "$OUT/de_test.parquet" \
+  --de_train_h5ad "$OUT/de_train.h5ad" \
+  --de_test_h5ad "$OUT/de_test.h5ad" \
   --id_map "$OUT/id_map.csv" \
-  --output "$OUT/prediction.parquet"
+  --output "$OUT/prediction.h5ad"
 
 echo ">> Run metric"
 viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \
-  --prediction "$OUT/prediction.parquet" \
-  --method_id "sample" \
+  --prediction "$OUT/prediction.h5ad" \
   --de_test_h5ad "$OUT/de_test.h5ad" \
   --output "$OUT/score.h5ad"
 

diff --git a/src/common/nextflow_helpers/labels_tw.config b/src/common/nextflow_helpers/labels_tw.config
@@ -44,19 +44,20 @@ process {
     containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
        ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
   }
-  withLabel: midgpu {
-    cpus = 32
+  withLabel: midgpu { // aiming for g4dn.12xlarge
+    memory = 150.GB
+    cpus = 40
     accelerator = 4
     containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
        ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
   }
-  withLabel: highgpu {
-    cpus = 64
+  withLabel: highgpu { // aiming for g4dn.metal
+    memory = 300.GB
+    cpus = 80
     accelerator = 8
     containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
        ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
   }
-
   // make sure publishstates gets enough disk space and memory
   withName:'.*publishStatesProc' {
     memory = '16GB'

diff --git a/src/task/api/comp_control_method.yaml b/src/task/api/comp_control_method.yaml
@@ -8,18 +8,23 @@ functionality:
       description: |
         A control method to predict perturbation effects.
   arguments:
-    - name: --de_train
-      __merge__: file_de_train_parquet.yaml
-      required: true
+    - name: --de_train_h5ad
+      __merge__: file_de_train_h5ad.yaml
+      required: false
       direction: input
-    - name: --de_test
-      __merge__: file_de_test_parquet.yaml
+    - name: --de_test_h5ad
+      __merge__: file_de_test_h5ad.yaml
       required: true
       direction: input
     - name: --id_map
       __merge__: file_id_map.yaml
       required: true
       direction: input
+    - name: --layer
+      type: string
+      direction: input
+      default: sign_log10_pval
+      description: Which layer to use for prediction.
     - name: --output
       __merge__: file_prediction.yaml
       required: true

diff --git a/src/task/api/comp_method.yaml b/src/task/api/comp_method.yaml
@@ -1,35 +1,5 @@
+__merge__: comp_method_notest.yaml
 functionality:
-  namespace: "methods"
-  info:
-    type: method
-    type_info:
-      label: Method
-      summary: A regression method.
-      description: |
-        A regression method to predict the expression of one modality from another.
-  arguments:
-    - name: --de_train
-      __merge__: file_de_train_parquet.yaml
-      required: false
-      direction: input
-    - name: --de_train_h5ad
-      __merge__: file_de_train_h5ad.yaml
-      required: false
-      direction: input
-    - name: --id_map
-      __merge__: file_id_map.yaml
-      required: true
-      direction: input
-    - name: --output
-      __merge__: file_prediction.yaml
-      required: true
-      direction: output
-    - name: "--output_model"
-      type: "file"
-      description: "Optional model output. If no value is passed, the model will be removed at the end of the run."
-      direction: output
-      required: false
-      must_exist: false
   test_resources:
     - type: python_script
       path: /src/common/component_tests/run_and_check_output.py

diff --git a/src/task/api/comp_method_notest.yaml b/src/task/api/comp_method_notest.yaml
@@ -0,0 +1,33 @@
+functionality:
+  namespace: "methods"
+  info:
+    type: method
+    type_info:
+      label: Method
+      summary: A regression method.
+      description: |
+        A regression method to predict the expression of one modality from another.
+  arguments:
+    - name: --de_train_h5ad
+      __merge__: file_de_train_h5ad.yaml
+      required: false
+      direction: input
+    - name: --id_map
+      __merge__: file_id_map.yaml
+      required: true
+      direction: input
+    - name: --layer
+      type: string
+      direction: input
+      default: sign_log10_pval
+      description: Which layer to use for prediction.
+    - name: --output
+      __merge__: file_prediction.yaml
+      required: true
+      direction: output
+    - name: "--output_model"
+      type: "file"
+      description: "Optional model output. If no value is passed, the model will be removed at the end of the run."
+      direction: output
+      required: false
+      must_exist: false
diff --git a/src/task/api/comp_method_run.yaml b/src/task/api/comp_method_run.yaml
diff --git a/src/task/api/comp_method_train.yaml b/src/task/api/comp_method_train.yaml
diff --git a/src/task/api/comp_metric.yaml b/src/task/api/comp_metric.yaml
@@ -12,20 +12,31 @@ functionality:
       __merge__: file_de_test_h5ad.yaml
       required: true
       direction: input
+    - name: --de_test_layer
+      type: string
+      direction: input
+      default: sign_log10_pval
+      description: In which layer to find the DE data.
     - name: --prediction
       __merge__: file_prediction.yaml
       required: true
       direction: input
-    - name: --method_id
+    - name: --prediction_layer
       type: string
-      required: true
       direction: input
-      info:
-        test_default: test
+      default: prediction
+      description: In which layer to find the predicted DE data.
     - name: --output
       __merge__: file_score.yaml
       direction: output
       required: true
+    - name: --resolve_genes
+      type: string
+      direction: input
+      default: de_test
+      choices: [de_test, intersection]
+      description: |
+        How to resolve difference in genes between the two datasets.
   test_resources:
     - type: python_script
       path: /src/common/component_tests/run_and_check_output.py

diff --git a/src/task/api/comp_process_dataset.yaml b/src/task/api/comp_process_dataset.yaml
@@ -1,5 +1,4 @@
 functionality:
-  namespace: "process_dataset"
   info:
     type: process_dataset
     type_info:
@@ -12,16 +11,6 @@ functionality:
       __merge__: file_sc_counts.yaml
       required: true
       direction: input
-    - name: --de_train
-      __merge__: file_de_train_parquet.yaml
-      required: true
-      direction: output
-      default: de_train.parquet
-    - name: --de_test
-      __merge__: file_de_test_parquet.yaml
-      required: true
-      direction: output
-      default: de_test.parquet
     - name: --de_train_h5ad
       __merge__: file_de_train_h5ad.yaml
       required: true

diff --git a/src/task/api/file_de_test_h5ad.yaml b/src/task/api/file_de_test_h5ad.yaml
@@ -41,6 +41,18 @@ info:
         description: "Boolean indicating whether this instance was used as a control."
         required: true
     layers:
+      - name: logFC
+        type: double
+        description: "Log fold change of the differential expression test"
+        required: true
+      - name: AveExpr
+        type: double
+        description: "Average expression of the differential expression test"
+        required: false
+      - name: t
+        type: double
+        description: "T-statistic of the differential expression test"
+        required: false
       - name: P.Value
         type: double
         description: "P-value of the differential expression test"
@@ -49,6 +61,10 @@ info:
         type: double
         description: "Adjusted P-value of the differential expression test"
         required: true
+      - name: B
+        type: double
+        description: "B-statistic of the differential expression test"
+        required: false
       - name: is_de
         type: boolean
         description: "Whether the gene is differentially expressed"
@@ -57,10 +73,6 @@ info:
         type: boolean
         description: "Whether the gene is differentially expressed after adjustment"
         required: true
-      - name: logFC
-        type: double
-        description: "Log fold change of the differential expression test"
-        required: true
       - name: sign_log10_pval
         type: double
         description: |
@@ -100,3 +112,8 @@ info:
         description: The organism of the sample in the dataset.
         required: false
         multiple: true
+      - name: single_cell_obs
+        type: dataframe
+        description: |
+          A dataframe with the cell-level metadata.
+        required: true