openproblems-bio · rcannood · Dec 19, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,8 @@
   - `control_methods/shuffle_integration_by_batch`
   - `control_methods/shuffle_integration_by_cell_type`
 
+* Added `metrics/emd_per_samples` component (PR #9).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES

diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml
@@ -1,5 +1,5 @@
 type: file
-example: "resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/score.h5ad"
+example: "resources_test/task_cyto_batch_integration/starter_file/score.h5ad"
 label: Score
 summary: "File indicating the score of a metric."
 info:
@@ -11,12 +11,13 @@ info:
         description: "A unique identifier for the dataset"
         required: true
       - type: string
-        name: normalization_id
-        description: "Which normalization was used"
+        name: method_id
+        description: "A unique identifier for the batch correction method"
         required: true
       - type: string
-        name: method_id
-        description: "A unique identifier for the method"
+        name: sample_ids
+        description: "The samples assessed by the metric"
+        multiple: true
         required: true
       - type: string
         name: metric_ids
@@ -25,6 +26,6 @@ info:
         required: true
       - type: double
         name: metric_values
-        description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'."
+        description: "The metric values obtained. Must be of same length as 'metric_ids'."
         multiple: true
         required: true
diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml
@@ -0,0 +1,91 @@
+# The API specifies which type of component this is.
+# It contains specifications for:
+#   - The input/output files
+#   - Common parameters
+#   - A unit test
+__merge__: ../../api/comp_metric.yaml
+
+# A unique identifier for your component (required).
+# Can contain only lowercase letters or underscores.
+name: emd_per_samples
+
+# Metadata for your component
+info:
+  metrics:
+      # A unique identifier for your metric (required).
+      # Can contain only lowercase letters or underscores.
+    - name: emd_per_samples
+      # A relatively short label, used when rendering visualisations (required)
+      label: EMD Per Samples
+      # A one sentence summary of how this metric works (required). Used when 
+      # rendering summary tables.
+      summary: "Earth Mover Distance to compute differences in marker expression across two samples."
+      # A multi-line description of how this component works (required). Used
+      # when rendering reference documentation.
+      description: |
+        Earth Mover Distance (EMD) is a metric designed for comparing two distributions.
+        It is also known as the Wasserstein metric.
+      references:
+        doi: 
+          - 10.1023/A:1026543900054
+      links:
+        # URL to the documentation for this metric (required).
+        documentation: https://cytonormpy.readthedocs.io/en/latest/generated/cytonormpy.emd_comparison_from_anndata.html
+        # URL to the code repository for this metric (required).
+        repository: https://github.com/TarikExner/CytoNormPy
+      # The minimum possible value for this metric (required)
+      min: 0
+      # The maximum possible value for this metric (required)
+      max: .inf
+      # Whether a higher value represents a 'better' solution (required)
+      maximize: false
+  # Note: test_setup is needed because a component-specific argument has no default.
+  # When running the actual command, either separate the sample names with ';'
+  # (e.g. Tube1_Batch1_WT;Tube1_Batch2_WT)
+  # or repeat the flag once per sample, e.g. --samples_to_compare Tube1_Batch1_WT
+  # --samples_to_compare Tube1_Batch2_WT
+  test_setup:
+    starter_file:
+      samples_to_compare: 
+        - Tube1_Batch1_WT
+        - Tube1_Batch2_WT
+
+# Component-specific parameters (optional)
+arguments:
+  - name: "--samples_to_compare"
+    type: "string"
+    description: "The 2 samples to compare. Separate the sample names with ';' or repeat the flag once per sample."
+    required: true
+    multiple: true
+  - name: "--layer"
+    type: "string"
+    default: "integrated"
+    description: The layer in input anndata containing the marker expression
+    description: The layer in input anndata containing the marker expression
+
+# Resources required to run the component
+resources:
+  # The script of your component (required)
+  - type: python_script
+    path: script.py
+  # Additional resources your script needs (optional)
+  # - type: file
+  #   path: weights.pt
+
+engines:
+  # Specifications for the Docker image for this component.
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    # Add custom dependencies here (optional). For more information, see
+    # https://viash.io/reference/config/engines/docker/#setup .
+    setup:
+      - type: python
+        packages: [anndata]
+        github: [TarikExner/CytoNormPy]
+
+runners:
+  # This platform allows running the component natively
+  - type: executable
+  # Allows turning the component into a Nextflow module / pipeline.
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py
@@ -0,0 +1,57 @@
+import anndata as ad
+import cytonormpy as cnp
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+    "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad",
+    "output": "output.h5ad",
+    "samples_to_compare": "Tube1_Batch1_WT,Tube1_Batch2_WT",
+    "layer": "integrated",
+}
+meta = {"name": "emd_per_samples"}
+## VIASH END
+
+print("Reading input files", flush=True)
+
+adata = ad.read_h5ad(par["input_integrated"])
+
+samples_to_compare = [x.strip() for x in par["samples_to_compare"]]
+
+layer = par["layer"]
+
+markers_to_assess = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+print("Compute metrics", flush=True)
+
+# have to change the "sample" column to file_name for emd_comparison_from_anndata to work.
+# Otherwise the _calculate_emd_per_frame used in cytonormpy will error because they
+# harcoded the column file_name and use it in assert.
+# See line 176 of https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173
+adata.obs["file_name"] = adata.obs["sample"]
+
+df = cnp.emd_from_anndata(
+    adata=adata,
+    file_list=samples_to_compare,
+    channels=markers_to_assess,
+    layer=layer,
+    sample_identifier_column="file_name",
+)
+
+uns_metric_ids = [f"EMD_per_samples_{x}" for x in df.columns]
+uns_metric_values = df.loc["all_cells"].to_numpy()
+uns_method_id = adata.uns["method_id"] if "method_id" in adata.uns else "unintegrated"
+
+
+print("Write output AnnData to file", flush=True)
+output = ad.AnnData(
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": uns_method_id,
+        "sample_ids": samples_to_compare,
+        "metric_ids": uns_metric_ids,
+        "metric_values": uns_metric_values,
+    }
+)
+output.write_h5ad(par["output"], compression="gzip")