Multiple fixes to components (openproblems-bio#37)

* fix method names and descriptions * remove rf * tweak resources * make output model an optional argument in the method api * fix metadata * output model does not need to be created
Paulos2411 · May 18, 2024 · ba4429d · ba4429d
1 parent 4fa04bb
commit ba4429d
Show file tree

Hide file tree

Showing 25 changed files with 56 additions and 130 deletions.
diff --git a/src/common/component_tests/run_and_check_output.py b/src/common/component_tests/run_and_check_output.py
@@ -7,8 +7,8 @@
 
 ## VIASH START
 meta = {
-    "executable": "target/docker/methods/first_place/first_place",
-    "config": "target/docker/methods/first_place/.config.vsh.yaml",
+    "executable": "target/docker/methods/lstm_gru_cnn_ensemble/lstm_gru_cnn_ensemble",
+    "config": "target/docker/methods/lstm_gru_cnn_ensemble/.config.vsh.yaml",
     "resources_dir": "resources"
 }
 ## VIASH END

diff --git a/src/common/nextflow_helpers/labels_tw.config b/src/common/nextflow_helpers/labels_tw.config
@@ -35,6 +35,7 @@ process {
     containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""}
   }
   withLabel: gpu {
+    cpus: 8
     accelerator = 1
     containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
        ( workflow.containerEngine == "docker" ? '--gpus all': null ) }

diff --git a/src/task/api/comp_method.yaml b/src/task/api/comp_method.yaml
@@ -24,6 +24,12 @@ functionality:
       __merge__: file_prediction.yaml
       required: true
       direction: output
+    - name: "--output_model"
+      type: "file"
+      description: "Optional model output. If no value is passed, the model will be removed at the end of the run."
+      direction: output
+      required: false
+      must_exist: false
   test_resources:
     - type: python_script
       path: /src/common/component_tests/run_and_check_output.py

diff --git a/src/task/methods/jn_ap_op2/config.vsh.yaml b/src/task/methods/jn_ap_op2/config.vsh.yaml
@@ -3,7 +3,8 @@ __merge__: ../../api/comp_method.yaml
 functionality:
   name: jn_ap_op2
   info:
-    label: A deep NN model proposed by Antoine Passiemier and Jalil Nourisa
+    label: JN-AP-OP2
+    rank: 20
     summary: "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP"
     description: |
       We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,
@@ -42,4 +43,4 @@ platforms:
   - type: native
   - type: nextflow
     directives:
-      label: [ hightime, highmem, highcpu, gpu ]
+      label: [ hightime, midmem, highcpu, gpu ]
diff --git a/...ethods/first_place/closest_sqrt_factor.py → ...m_gru_cnn_ensemble/closest_sqrt_factor.py b/...ethods/first_place/closest_sqrt_factor.py → ...m_gru_cnn_ensemble/closest_sqrt_factor.py
diff --git a/src/task/methods/first_place/config.vsh.yaml → ...ods/lstm_gru_cnn_ensemble/config.vsh.yaml b/src/task/methods/first_place/config.vsh.yaml → ...ods/lstm_gru_cnn_ensemble/config.vsh.yaml
@@ -1,9 +1,10 @@
 __merge__: ../../api/comp_method.yaml
 
 functionality:
-  name: first_place
+  name: lstm_gru_cnn_ensemble
   info:
-    label: First Place
+    label: LSTM-GRU-CNN Ensemble
+    rank: 1
     summary: An ensemble of LSTM, GRU, and 1D CNN models
     description: |
       An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings,
@@ -15,12 +16,6 @@ functionality:
     repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main
 
   arguments:
-    - name: "--extra_output"
-      type: "file"
-      default: "resources/neurips-2023-data/extra_output/"
-      description: "Extra output directory. Will contain folders `prepared_data`, `trained_models`, and `logs`."
-      direction: output
-      required: false
     - name: --epochs
       type: integer
       default: 250

diff --git a/...ask/methods/first_place/helper_classes.py → ...s/lstm_gru_cnn_ensemble/helper_classes.py b/...ask/methods/first_place/helper_classes.py → ...s/lstm_gru_cnn_ensemble/helper_classes.py
diff --git a/...k/methods/first_place/helper_functions.py → ...lstm_gru_cnn_ensemble/helper_functions.py b/...k/methods/first_place/helper_functions.py → ...lstm_gru_cnn_ensemble/helper_functions.py
diff --git a/src/task/methods/first_place/models.py → ...k/methods/lstm_gru_cnn_ensemble/models.py b/src/task/methods/first_place/models.py → ...k/methods/lstm_gru_cnn_ensemble/models.py
diff --git a/src/task/methods/first_place/predict.py → .../methods/lstm_gru_cnn_ensemble/predict.py b/src/task/methods/first_place/predict.py → .../methods/lstm_gru_cnn_ensemble/predict.py
diff --git a/src/task/methods/first_place/prepare_data.py → ...ods/lstm_gru_cnn_ensemble/prepare_data.py b/src/task/methods/first_place/prepare_data.py → ...ods/lstm_gru_cnn_ensemble/prepare_data.py
diff --git a/src/task/methods/first_place/script.py → ...k/methods/lstm_gru_cnn_ensemble/script.py b/src/task/methods/first_place/script.py → ...k/methods/lstm_gru_cnn_ensemble/script.py
@@ -11,10 +11,10 @@
     "epochs": 1,
     "kf_n_splits": 2,
     "output": "output.parquet",
-    "extra_output": None
+    "output_model": None
 }
 meta = {
-    "resources_dir": "src/task/methods/first_place",
+    "resources_dir": "src/task/methods/lstm_gru_cnn_ensemble",
     "temp_dir": "/tmp"
 }
 ## VIASH END
@@ -27,19 +27,19 @@
 from predict import predict
 
 # create a temporary directory for storing models
-extra_output = par["extra_output"] or tempfile.TemporaryDirectory(dir = meta["temp_dir"]).name
+output_model = par["output_model"] or tempfile.TemporaryDirectory(dir = meta["temp_dir"]).name
 paths = {
     "output": par["output"],
-    "extra_output": extra_output,
-    "train_data_aug_dir": f"{extra_output}/train_data_aug_dir",
-    "model_dir": f"{extra_output}/model_dir",
-    "logs_dir": f"{extra_output}/logs"
+    "output_model": output_model,
+    "train_data_aug_dir": f"{output_model}/train_data_aug_dir",
+    "model_dir": f"{output_model}/model_dir",
+    "logs_dir": f"{output_model}/logs"
 }
 
 # remove temp dir on exit
-if not par["extra_output"]:
+if not par["output_model"]:
 	import atexit
-	atexit.register(lambda: shutil.rmtree(extra_output))
+	atexit.register(lambda: shutil.rmtree(output_model))
 
 # prepare data
 prepare_data(par, paths)

diff --git a/src/task/methods/first_place/train.py → ...sk/methods/lstm_gru_cnn_ensemble/train.py b/src/task/methods/first_place/train.py → ...sk/methods/lstm_gru_cnn_ensemble/train.py
diff --git a/src/task/methods/third_place/config.vsh.yaml → ...raining_with_pseudolabels/config.vsh.yaml b/src/task/methods/third_place/config.vsh.yaml → ...raining_with_pseudolabels/config.vsh.yaml
@@ -1,26 +1,29 @@
 __merge__: ../../api/comp_method.yaml
 
 functionality:
-  name: third_place
+  name: nn_retraining_with_pseudolabels
   info:
-    label: Third Place
-    summary: Neural networks with pseudolabeling with ensemble modelling
+    label: NN retraining with pseudolabels
+    rank: 3
+    summary: Neural networks with pseudolabeling and ensemble modelling
     description: |
       The prediction system is two staged, so I publish two versions of the notebook.
       The first stage predicts pseudolabels. To be honest, if I stopped on this version, I would not be the third.
       The predicted pseudolabels on all test data (255 rows) are added to training in the second stage.
-      Stage 1 preparing pseudolabels
+      
+      ## Stage 1 preparing pseudolabels
 
       The main part of this system is a neural network. Every neural network and its environment was optimized by optuna. Hyperparameters that have been optimized:
       a dropout value, a number of neurons in particular layers, an output dimension of an embedding layer, a number of epochs, a learning rate, a batch size, a number of dimensions of the truncated singular value decomposition.
       The optimization was done on custom 4-folds cross validation. In order to avoid overfitting to cross validation by optuna I applied 2 repeats for every fold and took an average. Generally, the more, the better. The optuna's criterion was MRRMSE.
       Finally, 7 models were ensembled. Optuna was applied again to determine best weights of linear combination. The prediction of test set is the pseudolabels now and will be used in second stage.
-      Stage 2 retraining with pseudolabels
+      
+      ## Stage 2 retraining with pseudolabels
 
       The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with optimized parameters in different experiments for a model diversity.
       Optuna selected optimal weights for the linear combination of the prediction again.
       Models had high variance, so every model was trained 10 times on all dataset and the median of prediction is taken as a final prediction. The prediction was additionally clipped to colwise min and max. 
-    reference: N/A
+    reference: null
     documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458750
     repository_url: https://github.com/okon2000/single_cell_perturbations
 
@@ -56,4 +59,4 @@ platforms:
   - type: native
   - type: nextflow
     directives:
-      label: [ hightime, highmem, highcpu, gpu ]
+      label: [ hightime, midmem, highcpu, gpu ]
diff --git a/src/task/methods/third_place/notebook_264.py → ...raining_with_pseudolabels/notebook_264.py b/src/task/methods/third_place/notebook_264.py → ...raining_with_pseudolabels/notebook_264.py
diff --git a/src/task/methods/third_place/notebook_266.py → ...raining_with_pseudolabels/notebook_266.py b/src/task/methods/third_place/notebook_266.py → ...raining_with_pseudolabels/notebook_266.py
diff --git a/src/task/methods/third_place/script.py → ...nn_retraining_with_pseudolabels/script.py b/src/task/methods/third_place/script.py → ...nn_retraining_with_pseudolabels/script.py
@@ -25,7 +25,7 @@
     "output": "output.parquet",
     "reps": 2,
 }
-meta = {"resources_dir": "src/task/methods/third_place"}
+meta = {"resources_dir": "src/task/methods/nn_retraining_with_pseudolabels"}
 ## VIASH END
 
 def main(par, meta):

diff --git a/src/task/methods/random_forest/config.vsh.yaml b/src/task/methods/random_forest/config.vsh.yaml
diff --git a/src/task/methods/random_forest/script.R b/src/task/methods/random_forest/script.R
diff --git a/src/task/methods/scape/config.vsh.yaml b/src/task/methods/scape/config.vsh.yaml
@@ -3,6 +3,7 @@ functionality:
   name: scape
   info:
     label: ScAPE
+    rank: 16
     summary: Neural network model for drug effect prediction
     description: |
       ScAPE utilises a neural network (NN) model to estimate drug effects on gene expression in
@@ -18,12 +19,6 @@ functionality:
     documentation_url: https://docs.google.com/document/d/1w0GIJ8VoQx3HEJNmLXoU-Y_STB-h5-bXusL80_6EVuU/edit
     repository_url: https://github.com/scapeML/scape
   arguments:
-    - type: file
-      name: --output_dir
-      direction: output
-      description: Additional output directory for the model.
-      required: false
-      example: path/to/output_dir
     - type: string
       name: --cell
       description: Pre-defined cell type held for pre-training.

diff --git a/src/task/methods/scape/script.py b/src/task/methods/scape/script.py
@@ -18,7 +18,7 @@
 	de_train_h5ad = "resources/neurips-2023-data/de_train.h5ad",
 	id_map = "resources/neurips-2023-data/id_map.csv",
 	output = "output/neurips-2023-data/output_rf.parquet",
-	output_dir = None,
+	output_model = None,
 	cell = "NK cells",
 	epochs = 2,
 	epochs_enhanced = 2,
@@ -34,11 +34,11 @@
 
 print(f"par: {par}")
 
-# if output_dir is not provided, create a temporary directory
-model_dir = par["output_dir"] or tempfile.TemporaryDirectory(dir = meta["temp_dir"]).name
+# if output_model is not provided, create a temporary directory
+model_dir = par["output_model"] or tempfile.TemporaryDirectory(dir = meta["temp_dir"]).name
 
 # remove temp dir on exit
-if not par["output_dir"]:
+if not par["output_model"]:
 	import atexit
 	atexit.register(lambda: shutil.rmtree(model_dir))
 

diff --git a/src/task/metrics/mean_cosine_sim/config.vsh.yaml b/src/task/metrics/mean_cosine_sim/config.vsh.yaml
@@ -3,7 +3,7 @@ functionality:
   name: mean_cosine_sim
   info:
     metrics:
-      - name: mean_socine_sim
+      - name: mean_cosine_sim
         label: Mean Cosine Similarity
         summary: The mean of cosine similarities per row (perturbation).
         description: |
@@ -13,9 +13,9 @@ functionality:
           \textrm{Mean-Cosine} = \frac{1}{R} \sum_{i=1}^R \frac{\mathbf{y}_i \cdot \mathbf{\hat{y}}_i}{\|\mathbf{y}_i\| \|\mathbf{\hat{y}}_i\|}
           $$
 
-          where \\(R\\) is the number of scored rows, and \\(\mathbf{y}_i\\) and \\(\mathbf{\hat{y}}_i\\) are the actual and predicted values, respectively, for row \\(i\\).
-        repository_url: "..."
-        documentation_url: "..."
+          where $\\(R\\)$ is the number of scored rows, and $\\(\mathbf{y}_i\\)$ and $\\(\mathbf{\hat{y}}_i\\)$ are the actual and predicted values, respectively, for row $\\(i\\)$.
+        repository_url: null
+        documentation_url: null
         min: -1
         max: 1
         maximize: true

diff --git a/src/task/metrics/mean_rowwise_error/config.vsh.yaml b/src/task/metrics/mean_rowwise_error/config.vsh.yaml
@@ -13,9 +13,9 @@ functionality:
           \textrm{MRRMSE} = \frac{1}{R}\sum\_{i=1}^R\left(\frac{1}{n} \sum\_{j=1}^{n} (y\_{ij} - \widehat{y}\_{ij})^2\right)^{1/2}
           $$
 
-          where \\(R\\) is the number of scored rows, and \\(y_{ij}\\) and \\(\widehat{y}_{ij}\\) are the actual and predicted values, respectively, for row \\(i\\) and column \\(j\\), and \\(n\\) is the number of columns.
-        repository_url: "..."
-        documentation_url: "..."
+          where $\\(R\\)$ is the number of scored rows, and $\\(y_{ij}\\)$ and $\\(\widehat{y}_{ij}\\)$ are the actual and predicted values, respectively, for row $\\(i\\)$ and column $\\(j\\)$, and $\\(n\\)$ is the number of columns.
+        repository_url: null
+        documentation_url: null
         min: 0
         max: "+inf"
         maximize: false
@@ -29,9 +29,9 @@ functionality:
             \textrm{MRMAE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} |y_{ij} - \widehat{y}_{ij}|\right)
             $$
           
-            where \(R\) is the number of scored rows, and \(y_{ij}\) and \(\widehat{y}_{ij}\) are the actual and predicted values, respectively, for row \(i\) and column \(j\), and \(n\) is the number of columns.
-        repository_url: "..."
-        documentation_url: "..."
+            where $\\(R\\)$ is the number of scored rows, and $\\(y_{ij}\\)$ and $\\(\widehat{y}_{ij}\\)$ are the actual and predicted values, respectively, for row $\\(i\\)$ and column $\\(j\\)$, and $\\(n\\)$ is the number of columns.
+        repository_url: null
+        documentation_url: null
         min: 0
         max: "+inf"
         maximize: false

diff --git a/src/task/workflows/run_benchmark/config.vsh.yaml b/src/task/workflows/run_benchmark/config.vsh.yaml
@@ -73,11 +73,10 @@ functionality:
     - name: control_methods/mean_outcome
     - name: control_methods/mean_across_celltypes
     - name: control_methods/mean_across_compounds
-    - name: methods/random_forest
-    - name: methods/third_place
+    - name: methods/nn_retraining_with_pseudolabels
     - name: methods/scape
     - name: methods/jn_ap_op2
-    - name: methods/first_place
+    - name: methods/lstm_gru_cnn_ensemble
     - name: metrics/mean_rowwise_error
     - name: metrics/mean_cosine_sim
   repositories:

diff --git a/src/task/workflows/run_benchmark/main.nf b/src/task/workflows/run_benchmark/main.nf
@@ -19,9 +19,8 @@ workflow run_wf {
     mean_across_compounds,
     sample,
     zeros,
-    random_forest,
-    first_place,
-    third_place,
+    lstm_gru_cnn_ensemble,
+    nn_retraining_with_pseudolabels,
     jn_ap_op2,
     scape
   ]
@@ -78,6 +77,8 @@ workflow run_wf {
           de_train: state.de_train,
           de_train_h5ad: state.de_train_h5ad,
           id_map: state.id_map,
+          output: 'predictions/$id.$key.output.parquet',
+          output_model: null
         ]
         if (comp.config.functionality.info.type == "control_method") {
           new_args.de_test = state.de_test