openproblems-bio · KaiWaldrant · Dec 14, 2022 · Dec 14, 2022 · Dec 14, 2022 · Dec 14, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -179,3 +179,67 @@
 * `metrics/rmse` should be removed because RMSE metrics don't really make sense here.
 
 * `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`.
+
+
+## Multi modality - Joint Embedding
+
+### New functionality
+
+* `api/anndata_*`: Created file format specifications for the h5ad files used throughout the pipeline.
+
+* `api/comp_*`: Created an API definition for the mask, method and metric components.
+
+* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects.
+
+* `resources_test/joint_embedding/pancreas` with `src/joint_embedding/resources_test_scripts/pancreas.sh`.
+
+### neurips 2021 migration
+
+* `control_methods/random_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_random`.
+
+* `control_methods/zeros_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_zeros`.
+
+* `methods/lmds`: Migrated from neurips 2021.
+
+* `methods/mnn`: Migrated and adapted from neurips 2021.
+
+* `methods/newwave`: Migrated and adapted from neurips 2021.
+
+* `methods/pca`: Migrated from neurips 2021.
+
+* `methods/totalvi`: Migrated from neurips 2021.
+
+* `methods/umap`: Migrated from neurips 2021.
+
+* `metrics/ari`: Migrated from neurips 2021.
+
+* `metrics/asw_batch`: Migrated from neurips 2021.
+
+* `metrics/asw_label`: Migrated from neurips 2021.
+
+* `metrics/cc_cons`: Migrated from neurips 2021.
+
+* `metrics/check_format`: Migrated from neurips 2021.
+
+* `metrics/graph_connectivity`: Migrated from neurips 2021.
+
+* `metrics/latent_mixing`: Migrated from neurips 2021.
+
+* `metrics/nmi`: Migrated from neurips 2021.
+
+* `metrics/rfoob`: Migrated from neurips 2021.
+
+* `metrics/ti_cons`: Migrated from neurips 2021.
+
+* `metrics/ti_cons_batch`: Migrated from neurips 2021.
+
+### changes from neurips 2021
+
+* Updated docker config from R script. It was using an old `anndata` package which was giving warnings.
+
+* Methods now store their output in `.obsm["X_emb"]` instead of `.X` of the `anndata` object.
+
+* `X_emb` data is stored as a `Sparse Matrix`
+
+
+* Updated configs to the latest `viash`.
diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md
@@ -0,0 +1,23 @@
+# Joint embedding
+
+Structure of this task:
+
+    src/joint_embedding
+    ├── api                          Interface specifications for components and datasets in this task
+    ├── control_methods              Baseline (random/ground truth) methods to compare methods against
+    ├── methods                      Methods to be benchmarked
+    ├── metrics                      Metrics used to quantify performance of methods
+    ├── README.md                    This file
+    ├── resources_scripts            Scripts to process the datasets
+    ├── resources_test_scripts       Scripts to process the test resources
+    ├── mask_dataset                 Component to mask raw datasets into task-ready dataset objects
+    └── workflows                    Pipelines to run the full benchmark
+
+Relevant links:
+
+* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/)
+
+* [Experimental results](https://openproblems-experimental.netlify.app/results/joint_embedding/)
+
+<!-- update this to openproblems.bio/guide when possible -->
+* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md)
diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml
@@ -0,0 +1,75 @@
+type: file
+description: "A raw dataset"
+example: "dataset.h5ad"
+info:
+  label: "Dataset"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+      - type: double
+        name: size_factors
+        description: The size factors created by the normalisation method, if any.
+        required: false
+      - type: string
+        name: cell_type
+        description: Type of cells
+        required: false
+      - type: string
+        name: pseudotime_order_GEX
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ATAC
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ADT
+        description:
+        required: false
+      - type: double
+        name: S_score
+        description:
+        required: false
+      - type: double
+        name: G2M_score
+        description:
+        required: false
+      - type: boolean
+        name: is_train
+        description: Whether the sample is part of the training data
+        required: true
+    var:
+      - type: string
+        name: gene_ids
+        description: 
+        required: false
+      - type: string
+        name: feature_types
+        description:
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: "The organism the data originates from"
+        required: false
+      - type: string
+        name: gene_activity_var_names
+        description:
+        required: false
+      - type: string
+        name: sample_pm_varnames
+        description:
+        required: false
+
diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml
@@ -0,0 +1,37 @@
+type: file
+description: "The masked data"
+example: "masked.h5ad"
+info:
+  short_description: "masked data"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+      - type: double
+        name: size_factors
+        description:
+        required: false
+    var:
+      - type: string
+        name: feature_types
+        description: 
+        required: true
+      - type: string
+        name: gene_ids
+        description:
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: The organism the data originates from
+        required: true
diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml
@@ -0,0 +1,39 @@
+type: file
+description: "The masked data for mod2 file"
+example: "masked.h5ad"
+info:
+  short_description: "Masked data"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+    var:
+      - type: string
+        name: feature_types
+        description:
+        required: true
+      - type: string
+        name: gene_ids
+        description:
+        required: false
+    obsm:
+      - type: double
+        name: gene_activity
+        description: 
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: The organism the data originates from
+        required: true
diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml
@@ -0,0 +1,25 @@
+type: file
+description: "The prediction file"
+example: "prediction.h5ad"
+info:
+  short_description: "Prediction"
+  slots:
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+    obsm:
+      - type: double
+        name: X_emb
+        description:
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: method_id
+        description: "A unique identifier for the method"
+        required: true
diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml
@@ -0,0 +1,25 @@
+type: file
+description: "Metric score file"
+example: "output.h5ad"
+info:
+  short_description: "Score"
+  slots:
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: method_id
+        description: "A unique identifier for the method"
+        required: true
+      - type: string
+        name: metric_ids
+        description: "One or more unique metric identifiers"
+        multiple: true
+        required: true
+      - type: double
+        name: metric_values
+        description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'."
+        multiple: true
+        required: true
diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml
@@ -0,0 +1,57 @@
+type: file
+description: "The solution for the data"
+example: "solution.h5ad"
+info:
+  short_description: "Solution"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: false
+      - type: string
+        name: cell_type
+        description: Type of cells
+        required: false
+      - type: string
+        name: pseudotime_order_GEX
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ATAC
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ADT
+        description:
+        required: false
+      - type: double
+        name: S_score
+        description:
+        required: false
+      - type: double
+        name: G2M_score
+        description:
+        required: false
+    var:
+      - type: string
+        name: feature_types
+        description: 
+        required: true
+      - type: string
+        name: gene_ids
+        description:
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: The organism the data originates from
+        required: true
diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml
@@ -0,0 +1,24 @@
+functionality:
+  authors:
+    - name: Robrecht Cannoodt
+      roles: [ author ]
+      props: { github: rcannood, orcid: "0000-0003-3641-729X" }
+    - name: Kai Waldrant
+      roles: [ contributor ]
+      props: { github: KaiWaldrant }
+    - name: Alex Tong
+      email: alexandertongdev@gmail.com
+      roles: [ author, maintainer ]
+      props: { github: atong01 }
+    - name: Christopher Lance
+      email: clance.connect@gmail.com
+      roles: [ author, maintainer ]
+      props: { github: xlancelottx }
+    - name: Michaela Mueller
+      email: mumichae@in.tum.de
+      roles: [ author, maintainer ]
+      props: { github: mumichae, orcid: "0000-0002-1401-1785" }
+    - name: Ann Chen
+      email: ann.chen@czbiohub.org
+      roles: [ author, maintainer ]
+      props: { github: atchen }