Skip to content

Commit bc76a3f

Browse files
Backport #653 to 0.12.x (#656)
* Concatenation: fix order of var_names in concatenated object. (#653) * Typo in CHANGELOG * Update CI workflows * deploy: 47de7d8 * Update CHANGELOG * Revert "Update CI workflows" This reverts commit 47de7d8. --------- Co-authored-by: DriesSchaumont <DriesSchaumont@users.noreply.github.com>
1 parent bb0a94b commit bc76a3f

File tree

420 files changed

+1878
-1306
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

420 files changed

+1878
-1306
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# openpipelines 0.12.2
2+
3+
## BUG FIXES
4+
5+
* `dataflow/concat` and `dataflow/concatenate_h5mu`: Fix an issue where using `--other_axis_mode move` on samples with non-overlapping features would cause `var_names` to become misaligned with the data (PR #653).
6+
17
# openpipelines 0.12.1
28

39
## BUG FIXES

src/dataflow/concat/config.vsh.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ platforms:
7676
test_setup:
7777
- type: python
7878
__merge__: [ /src/base/requirements/viashpy.yaml, .]
79+
packages:
80+
- muon
7981
- type: native
8082
- type: nextflow
8183
directives:

src/dataflow/concat/script.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) ->
121121
is_duplicated = pool.map(nunique, iter(numpy_array))
122122
return any(is_duplicated)
123123

124-
def concatenate_matrices(n_processes: int, input_ids: tuple[str], matrices: Iterable[pd.DataFrame]) \
124+
def concatenate_matrices(n_processes: int, input_ids: tuple[str], matrices: Iterable[pd.DataFrame], align_to: pd.Index | None) \
125125
-> tuple[dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype]]:
126126
"""
127127
Merge matrices by combining columns that have the same name.
@@ -131,12 +131,13 @@ def concatenate_matrices(n_processes: int, input_ids: tuple[str], matrices: Iter
131131
column_names = set(column_name for var in matrices for column_name in var)
132132
logger.debug('Trying to concatenate columns: %s.', ",".join(column_names))
133133
if not column_names:
134-
return {}, None
134+
return {}, pd.DataFrame(index=align_to)
135135
conflicts, concatenated_matrix = \
136136
split_conflicts_and_concatenated_columns(n_processes,
137137
input_ids,
138138
matrices,
139-
column_names)
139+
column_names,
140+
align_to)
140141
concatenated_matrix = cast_to_writeable_dtype(concatenated_matrix)
141142
conflicts = {conflict_name: cast_to_writeable_dtype(conflict_df)
142143
for conflict_name, conflict_df in conflicts.items()}
@@ -152,7 +153,8 @@ def get_first_non_na_value_vector(df):
152153
def split_conflicts_and_concatenated_columns(n_processes: int,
153154
input_ids: tuple[str],
154155
matrices: Iterable[pd.DataFrame],
155-
column_names: Iterable[str]) -> \
156+
column_names: Iterable[str],
157+
align_to: pd.Index | None = None) -> \
156158
tuple[dict[str, pd.DataFrame], pd.DataFrame]:
157159
"""
158160
Retrieve columns with the same name from a list of dataframes which are
@@ -166,19 +168,21 @@ def split_conflicts_and_concatenated_columns(n_processes: int,
166168
for column_name in column_names:
167169
columns = [var[column_name] for var in matrices if column_name in var]
168170
assert columns, "Some columns should have been found."
169-
concatenated_columns = pd.concat(columns, axis=1, join="outer")
171+
concatenated_columns = pd.concat(columns, axis=1, join="outer", sort=False)
170172
if any_row_contains_duplicate_values(n_processes, concatenated_columns):
171173
concatenated_columns.columns = input_ids
174+
if align_to is not None:
175+
concatenated_columns = concatenated_columns.reindex(align_to, copy=False)
172176
conflicts[f'conflict_{column_name}'] = concatenated_columns
173177
else:
174178
unique_values = get_first_non_na_value_vector(concatenated_columns)
175-
# concatenated_columns.fillna(method='bfill', axis=1).iloc[:, 0]
176179
concatenated_matrix.append(unique_values)
177-
if concatenated_matrix:
178-
concatenated_matrix = pd.concat(concatenated_matrix, join="outer", axis=1)
179-
else:
180-
concatenated_matrix = pd.DataFrame()
181-
180+
if not concatenated_matrix:
181+
return conflicts, pd.DataFrame(index=align_to)
182+
concatenated_matrix = pd.concat(concatenated_matrix, join="outer",
183+
axis=1, sort=False)
184+
if align_to is not None:
185+
concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False)
182186
return conflicts, concatenated_matrix
183187

184188
def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame:
@@ -213,15 +217,17 @@ def split_conflicts_modalities(n_processes: int, input_ids: tuple[str], samples:
213217
matrices_to_parse = ("var", "obs")
214218
for matrix_name in matrices_to_parse:
215219
matrices = [getattr(sample, matrix_name) for sample in samples]
216-
conflicts, concatenated_matrix = concatenate_matrices(n_processes, input_ids, matrices)
217-
220+
output_index = getattr(output, matrix_name).index
221+
align_to = output_index if matrix_name == "var" else None
222+
conflicts, concatenated_matrix = concatenate_matrices(n_processes, input_ids, matrices, align_to)
223+
if concatenated_matrix.empty:
224+
concatenated_matrix.index = output_index
218225
# Write the conflicts to the output
219-
matrix_index = getattr(output, matrix_name).index
220226
for conflict_name, conflict_data in conflicts.items():
221-
getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data.reindex(matrix_index)
227+
getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data
222228

223229
# Set other annotation matrices in the output
224-
setattr(output, matrix_name, pd.DataFrame() if concatenated_matrix is None else concatenated_matrix)
230+
setattr(output, matrix_name, concatenated_matrix)
225231

226232
return output
227233

src/dataflow/concat/test.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77
import re
88
import sys
9+
import muon
910

1011
## VIASH START
1112
meta = {
@@ -25,6 +26,14 @@
2526
input_sample1_file = f"{meta['resources_dir']}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
2627
input_sample2_file = f"{meta['resources_dir']}/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
2728

29+
@pytest.fixture
30+
def anndata_to_sparse_dataframe():
31+
def wrapper(anndata_object):
32+
return pd.DataFrame.sparse.from_spmatrix(anndata_object.X,
33+
index=anndata_object.obs_names,
34+
columns=anndata_object.var_names)
35+
return wrapper
36+
2837
@pytest.fixture
2938
def mudata_without_genome(tmp_path, request):
3039
mudatas_to_change, modalities = request.param
@@ -39,6 +48,7 @@ def mudata_without_genome(tmp_path, request):
3948
# (here atac:genome) next to the old 'column_name' (here just 'genome')
4049
new_mudata.update_var()
4150
new_mudata.var.drop('genome', axis=1, inplace=True)
51+
new_mudata = new_mudata[0:500,] # subsample to reduce memory consumption
4252
new_path = tmp_path / Path(mudata_to_change).name
4353
new_mudata.write(new_path, compression="gzip")
4454
result.append(new_path)
@@ -345,6 +355,40 @@ def test_concat_invalid_h5_error_includes_path(run_component, tmp_path):
345355
])
346356
assert re.search(rf"OSError: Failed to load .*{str(empty_file)}\. Is it a valid h5 file?",
347357
err.value.stdout.decode('utf-8'))
358+
359+
360+
@pytest.mark.parametrize("mudata_without_genome",
361+
[([input_sample1_file], ["rna", "atac"])],
362+
indirect=["mudata_without_genome"])
363+
def test_concat_var_obs_names_order(run_component, mudata_without_genome,
364+
anndata_to_sparse_dataframe):
365+
"""
366+
Test that the var_names and obs_names are still linked to the correct count data.
367+
"""
368+
[sample1_without_genome,] = mudata_without_genome
369+
run_component([
370+
"--input_id", "mouse,human",
371+
"--input", sample1_without_genome,
372+
"--input", input_sample2_file,
373+
"--output", "concat.h5mu",
374+
"--other_axis_mode", "move"
375+
])
376+
assert Path("concat.h5mu").is_file() is True
377+
for sample_name, sample_path in {"mouse": sample1_without_genome,
378+
"human": input_sample2_file}.items():
379+
for mod_name in ["rna", "atac"]:
380+
data_sample = md.read_h5ad(sample_path, mod=mod_name)
381+
processed_data = md.read_h5ad("concat.h5mu", mod=mod_name)
382+
muon.pp.filter_obs(processed_data, 'sample_id', lambda x: x == sample_name)
383+
muon.pp.filter_var(processed_data, data_sample.var_names)
384+
data_sample_to_test = anndata_to_sparse_dataframe(data_sample)
385+
processed_data_to_test = anndata_to_sparse_dataframe(processed_data)
386+
data_sample_to_test = data_sample_to_test.reindex_like(processed_data_to_test)
387+
pd.testing.assert_index_equal(data_sample_to_test.columns, processed_data_to_test.columns)
388+
pd.testing.assert_index_equal(data_sample_to_test.index, processed_data_to_test.index)
389+
for (_, col1), (_, col2) in zip(data_sample_to_test.items(), processed_data_to_test.items()):
390+
pd._testing.assert_sp_array_equal(col1.array, col2.array)
391+
348392

349393
if __name__ == '__main__':
350394
sys.exit(pytest.main([__file__, "-v"]))

target/docker/annotate/popv/.config.vsh.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
functionality:
22
name: "popv"
33
namespace: "annotate"
4-
version: "0.12.0"
4+
version: "0.12.2"
55
authors:
66
- name: "Matthias Beyens"
77
roles:
@@ -247,6 +247,7 @@ platforms:
247247
image: "python:3.9-slim"
248248
target_organization: "openpipelines-bio"
249249
target_registry: "ghcr.io"
250+
target_tag: "0.12.0"
250251
namespace_separator: "_"
251252
resolve_volume: "Automatic"
252253
chown: true
@@ -340,5 +341,6 @@ info:
340341
output: "/home/runner/work/openpipeline/openpipeline/target/docker/annotate/popv"
341342
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/annotate/popv/popv"
342343
viash_version: "0.7.5"
343-
git_commit: "b41a65886f61959c7f83f84a0569c3da1d7a2856"
344+
git_commit: "47de7d80f389f495616a550f90ab36c5722d66c7"
344345
git_remote: "https://github.com/openpipelines-bio/openpipeline"
346+
git_tag: "0.12.1-3-g47de7d80f3"

target/docker/annotate/popv/popv

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22

3-
# popv 0.12.0
3+
# popv 0.12.2
44
#
55
# This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
66
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -159,7 +159,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"
159159

160160
# ViashHelp: Display helpful explanation about this executable
161161
function ViashHelp {
162-
echo "popv 0.12.0"
162+
echo "popv 0.12.2"
163163
echo ""
164164
echo "Performs popular major vote cell typing on single cell sequence data using"
165165
echo "multiple algorithms. Note that this is a one-shot version of PopV."
@@ -488,10 +488,10 @@ RUN cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \
488488
489489
LABEL org.opencontainers.image.authors="Matthias Beyens, Robrecht Cannoodt"
490490
LABEL org.opencontainers.image.description="Companion container for running component annotate popv"
491-
LABEL org.opencontainers.image.created="2023-10-24T07:38:06Z"
491+
LABEL org.opencontainers.image.created="2024-01-17T13:29:10Z"
492492
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
493-
LABEL org.opencontainers.image.revision="b41a65886f61959c7f83f84a0569c3da1d7a2856"
494-
LABEL org.opencontainers.image.version="0.12.0"
493+
LABEL org.opencontainers.image.revision="47de7d80f389f495616a550f90ab36c5722d66c7"
494+
LABEL org.opencontainers.image.version="0.12.2"
495495
496496
VIASHDOCKER
497497
}
@@ -642,7 +642,7 @@ while [[ $# -gt 0 ]]; do
642642
shift 1
643643
;;
644644
--version)
645-
echo "popv 0.12.0"
645+
echo "popv 0.12.2"
646646
exit
647647
;;
648648
--input)

target/docker/cluster/leiden/.config.vsh.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
functionality:
22
name: "leiden"
33
namespace: "cluster"
4-
version: "0.12.0"
4+
version: "0.12.2"
55
authors:
66
- name: "Dries De Maeyer"
77
roles:
@@ -140,6 +140,7 @@ platforms:
140140
image: "python:3.8-slim"
141141
target_organization: "openpipelines-bio"
142142
target_registry: "ghcr.io"
143+
target_tag: "0.12.0"
143144
namespace_separator: "_"
144145
resolve_volume: "Automatic"
145146
chown: true
@@ -213,5 +214,6 @@ info:
213214
output: "/home/runner/work/openpipeline/openpipeline/target/docker/cluster/leiden"
214215
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/cluster/leiden/leiden"
215216
viash_version: "0.7.5"
216-
git_commit: "b41a65886f61959c7f83f84a0569c3da1d7a2856"
217+
git_commit: "47de7d80f389f495616a550f90ab36c5722d66c7"
217218
git_remote: "https://github.com/openpipelines-bio/openpipeline"
219+
git_tag: "0.12.1-3-g47de7d80f3"

target/docker/cluster/leiden/leiden

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22

3-
# leiden 0.12.0
3+
# leiden 0.12.2
44
#
55
# This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
66
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -158,7 +158,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"
158158

159159
# ViashHelp: Display helpful explanation about this executable
160160
function ViashHelp {
161-
echo "leiden 0.12.0"
161+
echo "leiden 0.12.2"
162162
echo ""
163163
echo "Cluster cells using the Leiden algorithm [Traag18] implemented in the Scanpy"
164164
echo "framework [Wolf18]."
@@ -445,10 +445,10 @@ RUN pip install --upgrade pip && \
445445
446446
LABEL org.opencontainers.image.authors="Dries De Maeyer"
447447
LABEL org.opencontainers.image.description="Companion container for running component cluster leiden"
448-
LABEL org.opencontainers.image.created="2023-10-24T07:38:06Z"
448+
LABEL org.opencontainers.image.created="2024-01-17T13:29:08Z"
449449
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
450-
LABEL org.opencontainers.image.revision="b41a65886f61959c7f83f84a0569c3da1d7a2856"
451-
LABEL org.opencontainers.image.version="0.12.0"
450+
LABEL org.opencontainers.image.revision="47de7d80f389f495616a550f90ab36c5722d66c7"
451+
LABEL org.opencontainers.image.version="0.12.2"
452452
453453
VIASHDOCKER
454454
}
@@ -599,7 +599,7 @@ while [[ $# -gt 0 ]]; do
599599
shift 1
600600
;;
601601
--version)
602-
echo "leiden 0.12.0"
602+
echo "leiden 0.12.2"
603603
exit
604604
;;
605605
--input)

target/docker/compression/compress_h5mu/.config.vsh.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
functionality:
22
name: "compress_h5mu"
33
namespace: "compression"
4-
version: "0.12.0"
4+
version: "0.12.2"
55
authors:
66
- name: "Dries Schaumont"
77
roles:
@@ -83,6 +83,7 @@ platforms:
8383
image: "python:3.10-slim"
8484
target_organization: "openpipelines-bio"
8585
target_registry: "ghcr.io"
86+
target_tag: "0.12.0"
8687
namespace_separator: "_"
8788
resolve_volume: "Automatic"
8889
chown: true
@@ -161,5 +162,6 @@ info:
161162
output: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/compress_h5mu"
162163
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/compress_h5mu/compress_h5mu"
163164
viash_version: "0.7.5"
164-
git_commit: "b41a65886f61959c7f83f84a0569c3da1d7a2856"
165+
git_commit: "47de7d80f389f495616a550f90ab36c5722d66c7"
165166
git_remote: "https://github.com/openpipelines-bio/openpipeline"
167+
git_tag: "0.12.1-3-g47de7d80f3"

target/docker/compression/compress_h5mu/compress_h5mu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22

3-
# compress_h5mu 0.12.0
3+
# compress_h5mu 0.12.2
44
#
55
# This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
66
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -158,7 +158,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"
158158

159159
# ViashHelp: Display helpful explanation about this executable
160160
function ViashHelp {
161-
echo "compress_h5mu 0.12.0"
161+
echo "compress_h5mu 0.12.2"
162162
echo ""
163163
echo "Compress a MuData file."
164164
echo ""
@@ -408,10 +408,10 @@ RUN pip install --upgrade pip && \
408408
409409
LABEL org.opencontainers.image.authors="Dries Schaumont"
410410
LABEL org.opencontainers.image.description="Companion container for running component compression compress_h5mu"
411-
LABEL org.opencontainers.image.created="2023-10-24T07:38:11Z"
411+
LABEL org.opencontainers.image.created="2024-01-17T13:29:08Z"
412412
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
413-
LABEL org.opencontainers.image.revision="b41a65886f61959c7f83f84a0569c3da1d7a2856"
414-
LABEL org.opencontainers.image.version="0.12.0"
413+
LABEL org.opencontainers.image.revision="47de7d80f389f495616a550f90ab36c5722d66c7"
414+
LABEL org.opencontainers.image.version="0.12.2"
415415
416416
VIASHDOCKER
417417
}
@@ -562,7 +562,7 @@ while [[ $# -gt 0 ]]; do
562562
shift 1
563563
;;
564564
--version)
565-
echo "compress_h5mu 0.12.0"
565+
echo "compress_h5mu 0.12.2"
566566
exit
567567
;;
568568
--input)

0 commit comments

Comments
 (0)