openpipelines-bio
diff --git a/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/dataflow/concat/config.vsh.yaml‎
Lines changed: 2 additions & 0 deletions b/‎src/dataflow/concat/config.vsh.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/dataflow/concat/script.py‎
Lines changed: 22 additions & 16 deletions b/‎src/dataflow/concat/script.py‎
Lines changed: 22 additions & 16 deletions
diff --git a/‎src/dataflow/concat/test.py‎
Lines changed: 44 additions & 0 deletions b/‎src/dataflow/concat/test.py‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎target/docker/annotate/popv/.config.vsh.yaml‎
Lines changed: 4 additions & 2 deletions b/‎target/docker/annotate/popv/.config.vsh.yaml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎target/docker/annotate/popv/popv‎
Lines changed: 6 additions & 6 deletions b/‎target/docker/annotate/popv/popv‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎target/docker/cluster/leiden/.config.vsh.yaml‎
Lines changed: 4 additions & 2 deletions b/‎target/docker/cluster/leiden/.config.vsh.yaml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎target/docker/cluster/leiden/leiden‎
Lines changed: 6 additions & 6 deletions b/‎target/docker/cluster/leiden/leiden‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎target/docker/compression/compress_h5mu/.config.vsh.yaml‎
Lines changed: 4 additions & 2 deletions b/‎target/docker/compression/compress_h5mu/.config.vsh.yaml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎target/docker/compression/compress_h5mu/compress_h5mu‎
Lines changed: 6 additions & 6 deletions b/‎target/docker/compression/compress_h5mu/compress_h5mu‎
Lines changed: 6 additions & 6 deletions
@@ -1,3 +1,9 @@
+# openpipelines 0.12.2
+
+## BUG FIXES
+
+* `dataflow/concat` and `dataflow/concatenate_h5mu`: Fix an issue where using `--other_axis_mode move` on samples with non-overlapping features would cause `var_names` to become misaligned with the data (PR #653).
+
 # openpipelines 0.12.1
 
 ## BUG FIXES
 
@@ -76,6 +76,8 @@ platforms:
     test_setup:
       - type: python
         __merge__: [ /src/base/requirements/viashpy.yaml, .]
+        packages:
+          - muon
   - type: native
   - type: nextflow
     directives:
 
@@ -121,7 +121,7 @@ def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) ->
         is_duplicated = pool.map(nunique, iter(numpy_array))
     return any(is_duplicated)
 
-def concatenate_matrices(n_processes: int, input_ids: tuple[str], matrices: Iterable[pd.DataFrame]) \
+def concatenate_matrices(n_processes: int, input_ids: tuple[str], matrices: Iterable[pd.DataFrame], align_to: pd.Index | None) \
     -> tuple[dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype]]:
     """
     Merge matrices by combining columns that have the same name.
@@ -131,12 +131,13 @@ def concatenate_matrices(n_processes: int, input_ids: tuple[str], matrices: Iter
     column_names = set(column_name for var in matrices for column_name in var)
     logger.debug('Trying to concatenate columns: %s.', ",".join(column_names))
     if not column_names:
-        return {}, None
+        return {}, pd.DataFrame(index=align_to)
     conflicts, concatenated_matrix = \
         split_conflicts_and_concatenated_columns(n_processes,
                                                  input_ids,
                                                  matrices,
-                                                 column_names)
+                                                 column_names,
+                                                 align_to)
     concatenated_matrix = cast_to_writeable_dtype(concatenated_matrix)
     conflicts = {conflict_name: cast_to_writeable_dtype(conflict_df) 
                  for conflict_name, conflict_df in conflicts.items()}
@@ -152,7 +153,8 @@ def get_first_non_na_value_vector(df):
 def split_conflicts_and_concatenated_columns(n_processes: int,
                                              input_ids: tuple[str],
                                              matrices: Iterable[pd.DataFrame],
-                                             column_names: Iterable[str]) -> \
+                                             column_names: Iterable[str],
+                                             align_to: pd.Index | None = None) -> \
                                             tuple[dict[str, pd.DataFrame], pd.DataFrame]:
     """
     Retrieve columns with the same name from a list of dataframes which are
@@ -166,19 +168,21 @@ def split_conflicts_and_concatenated_columns(n_processes: int,
     for column_name in column_names:
         columns = [var[column_name] for var in matrices if column_name in var]
         assert columns, "Some columns should have been found."
-        concatenated_columns = pd.concat(columns, axis=1, join="outer")
+        concatenated_columns = pd.concat(columns, axis=1, join="outer", sort=False)
         if any_row_contains_duplicate_values(n_processes, concatenated_columns):
             concatenated_columns.columns = input_ids
+            if align_to is not None:
+                concatenated_columns = concatenated_columns.reindex(align_to, copy=False)
             conflicts[f'conflict_{column_name}'] = concatenated_columns
         else:
             unique_values = get_first_non_na_value_vector(concatenated_columns)
-            # concatenated_columns.fillna(method='bfill', axis=1).iloc[:, 0]
             concatenated_matrix.append(unique_values)
-    if concatenated_matrix:
-        concatenated_matrix = pd.concat(concatenated_matrix, join="outer", axis=1)
-    else:
-        concatenated_matrix = pd.DataFrame()
-
+    if not concatenated_matrix:
+        return conflicts, pd.DataFrame(index=align_to)
+    concatenated_matrix = pd.concat(concatenated_matrix, join="outer",
+                                    axis=1, sort=False)
+    if align_to is not None:
+        concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False)
     return conflicts, concatenated_matrix
 
 def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame:
@@ -213,15 +217,17 @@ def split_conflicts_modalities(n_processes: int, input_ids: tuple[str], samples:
     matrices_to_parse = ("var", "obs")
     for matrix_name in matrices_to_parse:
         matrices = [getattr(sample, matrix_name) for sample in samples]
-        conflicts, concatenated_matrix = concatenate_matrices(n_processes, input_ids, matrices)
-        
+        output_index = getattr(output, matrix_name).index 
+        align_to = output_index if matrix_name == "var" else None
+        conflicts, concatenated_matrix = concatenate_matrices(n_processes, input_ids, matrices, align_to)
+        if concatenated_matrix.empty:
+           concatenated_matrix.index = output_index 
         # Write the conflicts to the output
-        matrix_index = getattr(output, matrix_name).index
         for conflict_name, conflict_data in conflicts.items():
-            getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data.reindex(matrix_index)
+            getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data
 
         # Set other annotation matrices in the output
-        setattr(output, matrix_name, pd.DataFrame() if concatenated_matrix is None else concatenated_matrix)
+        setattr(output, matrix_name, concatenated_matrix)
 
     return output
 
 
@@ -6,6 +6,7 @@
 import pytest
 import re
 import sys
+import muon
 
 ## VIASH START
 meta = {
@@ -25,6 +26,14 @@
 input_sample1_file = f"{meta['resources_dir']}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
 input_sample2_file = f"{meta['resources_dir']}/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
 
+@pytest.fixture
+def anndata_to_sparse_dataframe():
+    """
+    Fixture returning a helper that turns an AnnData-like object into a
+    sparse pandas DataFrame.
+
+    The helper builds the frame from the object's `X` matrix, using its
+    `obs_names` as the row index and its `var_names` as the column labels.
+    """
+    def wrapper(anndata_object):
+        # NOTE(review): `from_spmatrix` presumably requires `X` to be a scipy
+        # sparse matrix; confirm that the test data is always stored sparse.
+        return pd.DataFrame.sparse.from_spmatrix(anndata_object.X,
+                                                 index=anndata_object.obs_names, 
+                                                 columns=anndata_object.var_names)
+    return wrapper
+
 @pytest.fixture
 def mudata_without_genome(tmp_path, request):
     mudatas_to_change, modalities = request.param
@@ -39,6 +48,7 @@ def mudata_without_genome(tmp_path, request):
         # (here atac:genome) next to the old 'column_name' (here just 'genome')
         new_mudata.update_var()
         new_mudata.var.drop('genome', axis=1, inplace=True)
+        new_mudata = new_mudata[0:500,] # subsample to reduce memory consumption
         new_path = tmp_path / Path(mudata_to_change).name
         new_mudata.write(new_path, compression="gzip")
         result.append(new_path)
@@ -345,6 +355,40 @@ def test_concat_invalid_h5_error_includes_path(run_component, tmp_path):
                 ])
         assert re.search(rf"OSError: Failed to load .*{str(empty_file)}\. Is it a valid h5 file?",
             err.value.stdout.decode('utf-8'))
+        
+
+@pytest.mark.parametrize("mudata_without_genome",
+                          [([input_sample1_file], ["rna", "atac"])],
+                          indirect=["mudata_without_genome"])
+def test_concat_var_obs_names_order(run_component, mudata_without_genome, 
+                                    anndata_to_sparse_dataframe):
+    """
+    Test that the var_names and obs_names are still linked to the correct count data.
+
+    The component is run with `--other_axis_mode move` on two inputs. For each
+    input sample and each modality, the matching part of the concatenated
+    output is compared with the original input: same column labels, same row
+    labels and the same values in every column.
+    """
+    # The fixture was parametrized with a single input file, so it yields
+    # exactly one modified path.
+    [sample1_without_genome,] = mudata_without_genome
+    run_component([
+            "--input_id", "mouse,human",
+            "--input", sample1_without_genome,
+            "--input", input_sample2_file,
+            "--output", "concat.h5mu",
+            "--other_axis_mode", "move"
+            ])
+    assert Path("concat.h5mu").is_file() is True
+    for sample_name, sample_path in {"mouse": sample1_without_genome, 
+                                     "human": input_sample2_file}.items():
+        for mod_name in ["rna", "atac"]:
+            data_sample = md.read_h5ad(sample_path, mod=mod_name)
+            processed_data = md.read_h5ad("concat.h5mu", mod=mod_name)
+            # Narrow the concatenated output down to this sample's observations
+            # and to the features that exist in the original sample.
+            # NOTE(review): relies on the muon filters modifying `processed_data`
+            # in place (their return values are not used) - confirm.
+            muon.pp.filter_obs(processed_data, 'sample_id', lambda x: x == sample_name)
+            muon.pp.filter_var(processed_data, data_sample.var_names)
+            data_sample_to_test = anndata_to_sparse_dataframe(data_sample)
+            processed_data_to_test = anndata_to_sparse_dataframe(processed_data)
+            # Give the original data the same row/column order as the output so
+            # the comparison below is by label rather than by position in the file.
+            data_sample_to_test = data_sample_to_test.reindex_like(processed_data_to_test)
+            pd.testing.assert_index_equal(data_sample_to_test.columns, processed_data_to_test.columns)
+            pd.testing.assert_index_equal(data_sample_to_test.index, processed_data_to_test.index)
+            # Compare the values column by column on the underlying sparse arrays.
+            # NOTE(review): `pd._testing` is a private pandas module and may
+            # change between pandas versions.
+            for (_, col1), (_, col2) in zip(data_sample_to_test.items(), processed_data_to_test.items()):
+                pd._testing.assert_sp_array_equal(col1.array, col2.array)
+            
 
 if __name__ == '__main__':
     sys.exit(pytest.main([__file__, "-v"]))
@@ -1,7 +1,7 @@
 functionality:
   name: "popv"
   namespace: "annotate"
-  version: "0.12.0"
+  version: "0.12.2"
   authors:
   - name: "Matthias Beyens"
     roles:
@@ -247,6 +247,7 @@ platforms:
   image: "python:3.9-slim"
   target_organization: "openpipelines-bio"
   target_registry: "ghcr.io"
+  target_tag: "0.12.0"
   namespace_separator: "_"
   resolve_volume: "Automatic"
   chown: true
@@ -340,5 +341,6 @@ info:
   output: "/home/runner/work/openpipeline/openpipeline/target/docker/annotate/popv"
   executable: "/home/runner/work/openpipeline/openpipeline/target/docker/annotate/popv/popv"
   viash_version: "0.7.5"
-  git_commit: "b41a65886f61959c7f83f84a0569c3da1d7a2856"
+  git_commit: "47de7d80f389f495616a550f90ab36c5722d66c7"
   git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.12.1-3-g47de7d80f3"
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# popv 0.12.0
+# popv 0.12.2
 # 
 # This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
 # work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -159,7 +159,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"
 
 # ViashHelp: Display helpful explanation about this executable
 function ViashHelp {
-  echo "popv 0.12.0"
+  echo "popv 0.12.2"
   echo ""
   echo "Performs popular major vote cell typing on single cell sequence data using"
   echo "multiple algorithms. Note that this is a one-shot version of PopV."
@@ -488,10 +488,10 @@ RUN cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \
 
 LABEL org.opencontainers.image.authors="Matthias Beyens, Robrecht Cannoodt"
 LABEL org.opencontainers.image.description="Companion container for running component annotate popv"
-LABEL org.opencontainers.image.created="2023-10-24T07:38:06Z"
+LABEL org.opencontainers.image.created="2024-01-17T13:29:10Z"
 LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
-LABEL org.opencontainers.image.revision="b41a65886f61959c7f83f84a0569c3da1d7a2856"
-LABEL org.opencontainers.image.version="0.12.0"
+LABEL org.opencontainers.image.revision="47de7d80f389f495616a550f90ab36c5722d66c7"
+LABEL org.opencontainers.image.version="0.12.2"
 
 VIASHDOCKER
 }
@@ -642,7 +642,7 @@ while [[ $# -gt 0 ]]; do
             shift 1
             ;;
         --version)
-            echo "popv 0.12.0"
+            echo "popv 0.12.2"
             exit
             ;;
         --input)
 
@@ -1,7 +1,7 @@
 functionality:
   name: "leiden"
   namespace: "cluster"
-  version: "0.12.0"
+  version: "0.12.2"
   authors:
   - name: "Dries De Maeyer"
     roles:
@@ -140,6 +140,7 @@ platforms:
   image: "python:3.8-slim"
   target_organization: "openpipelines-bio"
   target_registry: "ghcr.io"
+  target_tag: "0.12.0"
   namespace_separator: "_"
   resolve_volume: "Automatic"
   chown: true
@@ -213,5 +214,6 @@ info:
   output: "/home/runner/work/openpipeline/openpipeline/target/docker/cluster/leiden"
   executable: "/home/runner/work/openpipeline/openpipeline/target/docker/cluster/leiden/leiden"
   viash_version: "0.7.5"
-  git_commit: "b41a65886f61959c7f83f84a0569c3da1d7a2856"
+  git_commit: "47de7d80f389f495616a550f90ab36c5722d66c7"
   git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.12.1-3-g47de7d80f3"
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# leiden 0.12.0
+# leiden 0.12.2
 # 
 # This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
 # work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -158,7 +158,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"
 
 # ViashHelp: Display helpful explanation about this executable
 function ViashHelp {
-  echo "leiden 0.12.0"
+  echo "leiden 0.12.2"
   echo ""
   echo "Cluster cells using the Leiden algorithm [Traag18] implemented in the Scanpy"
   echo "framework [Wolf18]."
@@ -445,10 +445,10 @@ RUN pip install --upgrade pip && \
 
 LABEL org.opencontainers.image.authors="Dries De Maeyer"
 LABEL org.opencontainers.image.description="Companion container for running component cluster leiden"
-LABEL org.opencontainers.image.created="2023-10-24T07:38:06Z"
+LABEL org.opencontainers.image.created="2024-01-17T13:29:08Z"
 LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
-LABEL org.opencontainers.image.revision="b41a65886f61959c7f83f84a0569c3da1d7a2856"
-LABEL org.opencontainers.image.version="0.12.0"
+LABEL org.opencontainers.image.revision="47de7d80f389f495616a550f90ab36c5722d66c7"
+LABEL org.opencontainers.image.version="0.12.2"
 
 VIASHDOCKER
 }
@@ -599,7 +599,7 @@ while [[ $# -gt 0 ]]; do
             shift 1
             ;;
         --version)
-            echo "leiden 0.12.0"
+            echo "leiden 0.12.2"
             exit
             ;;
         --input)
 
@@ -1,7 +1,7 @@
 functionality:
   name: "compress_h5mu"
   namespace: "compression"
-  version: "0.12.0"
+  version: "0.12.2"
   authors:
   - name: "Dries Schaumont"
     roles:
@@ -83,6 +83,7 @@ platforms:
   image: "python:3.10-slim"
   target_organization: "openpipelines-bio"
   target_registry: "ghcr.io"
+  target_tag: "0.12.0"
   namespace_separator: "_"
   resolve_volume: "Automatic"
   chown: true
@@ -161,5 +162,6 @@ info:
   output: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/compress_h5mu"
   executable: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/compress_h5mu/compress_h5mu"
   viash_version: "0.7.5"
-  git_commit: "b41a65886f61959c7f83f84a0569c3da1d7a2856"
+  git_commit: "47de7d80f389f495616a550f90ab36c5722d66c7"
   git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.12.1-3-g47de7d80f3"
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# compress_h5mu 0.12.0
+# compress_h5mu 0.12.2
 # 
 # This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
 # work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -158,7 +158,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"
 
 # ViashHelp: Display helpful explanation about this executable
 function ViashHelp {
-  echo "compress_h5mu 0.12.0"
+  echo "compress_h5mu 0.12.2"
   echo ""
   echo "Compress a MuData file."
   echo ""
@@ -408,10 +408,10 @@ RUN pip install --upgrade pip && \
 
 LABEL org.opencontainers.image.authors="Dries Schaumont"
 LABEL org.opencontainers.image.description="Companion container for running component compression compress_h5mu"
-LABEL org.opencontainers.image.created="2023-10-24T07:38:11Z"
+LABEL org.opencontainers.image.created="2024-01-17T13:29:08Z"
 LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
-LABEL org.opencontainers.image.revision="b41a65886f61959c7f83f84a0569c3da1d7a2856"
-LABEL org.opencontainers.image.version="0.12.0"
+LABEL org.opencontainers.image.revision="47de7d80f389f495616a550f90ab36c5722d66c7"
+LABEL org.opencontainers.image.version="0.12.2"
 
 VIASHDOCKER
 }
@@ -562,7 +562,7 @@ while [[ $# -gt 0 ]]; do
             shift 1
             ;;
         --version)
-            echo "compress_h5mu 0.12.0"
+            echo "compress_h5mu 0.12.2"
             exit
             ;;
         --input)