From f1fc159a469f10848bb35c9a1562a53e6d56905c Mon Sep 17 00:00:00 2001 From: Genevieve Buckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 17 Dec 2021 19:15:44 +1100 Subject: [PATCH] Find objects bounding boxes (#240) * Alternative find_objects implementation * Dask docs suggest bag.fold is more efficient than bag.reduction * Merge dataframes instead of using combine * Make find_objects output obviously a dask dataframe * A clearer way to use dask delayed * Try to clarify dataframe column naming convention * Additional clarifying comment * Default step size for slices is None * Use Marvin's suggestion so we don't have to call compute twice on result * Revert delayed changes * Be consistent with slice step (use default value) * Remove redundant import statement * Avoid user having to call compute twice on result * Fix delayed so we know output is a dask dataframe * Use functools partial to pass in array dimension information to _find_objects * Improve docstrings in _find_objects.py * Add check for integer array dtype in find_objects * Re-trigger CI * Improve find_objects docstring re return value * find_objects _merge_bounding_boxes, clarify comment * Fix find_objects bug where chunk has no non-zero labels * Add test for find_objects with empty chunk in array --- dask_image/ndmeasure/__init__.py | 47 ++++++++++ dask_image/ndmeasure/_utils/_find_objects.py | 75 ++++++++++++++++ .../test_ndmeasure/test_find_objects.py | 87 +++++++++++++++++++ 3 files changed, 209 insertions(+) create mode 100644 dask_image/ndmeasure/_utils/_find_objects.py create mode 100644 tests/test_dask_image/test_ndmeasure/test_find_objects.py diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index 9213f16f..505603de 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -4,12 +4,16 @@ import functools import operator import warnings +from dask import compute, delayed import dask.array as da +import dask.bag as db +import dask.dataframe as dd import numpy as np from . import _utils from ._utils import _label +from ._utils._find_objects import _array_chunk_location, _find_bounding_boxes, _find_objects __all__ = [ "area", @@ -202,6 +206,49 @@ def extrema(image, label_image=None, index=None): return result +def find_objects(label_image): + """Return bounding box slices for each object labelled by integers. + + Parameters + ---------- + label_image : ndarray + Image features noted by integers. + + Returns + ------- + Dask dataframe + Each row respresents an indivdual integrer label. Columns contain the + slice information for the object boundaries in each dimension + (dimensions are named: 0, 1, ..., nd). + + Notes + ----- + You must have the optional dependency ``dask[dataframe]`` installed + to use the ``find_objects`` function. + """ + if label_image.dtype.char not in np.typecodes['AllInteger']: + raise ValueError("find_objects only accepts integer dtype arrays") + + block_iter = zip( + np.ndindex(*label_image.numblocks), + map(functools.partial(operator.getitem, label_image), + da.core.slices_from_chunks(label_image.chunks)) + ) + + arrays = [] + for block_id, block in block_iter: + array_location = _array_chunk_location(block_id, label_image.chunks) + arrays.append(delayed(_find_bounding_boxes)(block, array_location)) + + bag = db.from_sequence(arrays) + result = bag.fold(functools.partial(_find_objects, label_image.ndim), split_every=2).to_delayed() + meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) + result = delayed(compute)(result)[0] # avoid the user having to call compute twice on result + result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) + + return result + + def histogram(image, min, max, diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py new file mode 100644 index 00000000..e9a24ab9 --- /dev/null +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -0,0 +1,75 @@ +import numpy as np +import pandas as pd +from dask.delayed import Delayed +import dask.dataframe as dd + + +def _array_chunk_location(block_id, chunks): + """Pixel coordinate of top left corner of the array chunk.""" + array_location = [] + for idx, chunk in zip(block_id, chunks): + array_location.append(sum(chunk[:idx])) + return tuple(array_location) + + +def _find_bounding_boxes(x, array_location): + """An alternative to scipy.ndimage.find_objects. + + We use this alternative because scipy.ndimage.find_objects + returns a tuple of length N, where N is the largest integer label. + This is not ideal for distributed labels, where there might be only + one or two objects in an image chunk labelled with very large integers. + + This alternative function returns a pandas dataframe, + with one row per object found in the image chunk. + """ + unique_vals = np.unique(x) + unique_vals = unique_vals[unique_vals != 0] + result = {} + for val in unique_vals: + positions = np.where(x == val) + slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i]) for i, pos in enumerate(positions)) + result[val] = slices + column_names = [i for i in range(x.ndim)] # column names are: 0, 1, ... nD + return pd.DataFrame.from_dict(result, orient='index', columns=column_names) + + +def _combine_slices(slices): + "Return the union of all slices." + if len(slices) == 1: + return slices[0] + else: + start = min([sl.start for sl in slices]) + stop = max([sl.stop for sl in slices]) + return slice(start, stop) + + +def _merge_bounding_boxes(x, ndim): + """Merge the bounding boxes describing objects over multiple image chunks.""" + x = x.dropna() + data = {} + # For each dimension in the array, + # pick out the slice values belonging to that dimension + # and combine the slices + # (i.e. find the union; the slice expanded to all input slices). + for i in range(ndim): + # Array dimensions are labelled by a number followed by an underscroe + # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ... + # (x and y represent the pair of chunks label slices are merged from) + slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))] + combined_slices = _combine_slices(slices) + data[i] = combined_slices + result = pd.Series(data=data, index=[i for i in range(ndim)], name=x.name) + return result + + +def _find_objects(ndim, df1, df2): + """Main utility function for find_objects.""" + meta = dd.utils.make_meta([(i, object) for i in range(ndim)]) + if isinstance(df1, Delayed): + df1 = dd.from_delayed(df1, meta=meta) + if isinstance(df2, Delayed): + df2 = dd.from_delayed(df2, meta=meta) + ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True) + result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta) + return result diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py new file mode 100644 index 00000000..4f78d598 --- /dev/null +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -0,0 +1,87 @@ +from dask_image.ndmeasure._utils import _labeled_comprehension_delayed +import dask.array as da +import dask.dataframe as dd +import numpy as np +import pandas as pd +import pytest + +import dask_image.ndmeasure + + +@pytest.fixture +def label_image(): + """Return small label image for tests. + + dask.array + + array([[ 0, 0, 0, 0, 0, 0, 0, 333, 333, 333], + [111, 111, 0, 0, 0, 0, 0, 333, 333, 333], + [111, 111, 0, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 222, 222, 222, 222, 222, 222, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + """ + label_image = np.zeros((5, 10)).astype(int) + label_image[1:3,0:2] = 111 + label_image[3,3:-2] = 222 + label_image[0:2,-3:] = 333 + label_image = da.from_array(label_image, chunks=(5, 5)) + return label_image + + +@pytest.fixture +def label_image_with_empty_chunk(): + """Return small label image with an empty chunk for tests. + + dask.array + + array([[ 0, 0, 0, 0, 0, 0], + [111, 111, 0, 0, 0, 0], + [111, 111, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 222, 222, 222], + [ 0, 0, 0, 0, 0, 0]]) + """ + label_image = np.zeros((6, 6)).astype(int) + label_image[1:3,0:2] = 111 + label_image[4,3:] = 222 + label_image = da.from_array(label_image, chunks=(3, 3)) + return label_image + + +def test_find_objects(label_image): + result = dask_image.ndmeasure.find_objects(label_image) + assert isinstance(result, dd.DataFrame) + computed_result = result.compute() + assert isinstance(computed_result, pd.DataFrame) + expected = pd.DataFrame.from_dict( + {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}, + 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}} + ) + assert computed_result.equals(expected) + + +def test_3d_find_objects(label_image): + label_image = da.stack([label_image, label_image], axis=2) + result = dask_image.ndmeasure.find_objects(label_image) + assert isinstance(result, dd.DataFrame) + computed_result = result.compute() + assert isinstance(computed_result, pd.DataFrame) + expected = pd.DataFrame.from_dict( + {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}, + 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}, + 2: {111: slice(0, 2), 222: slice(0, 2), 333: slice(0, 2)}} + ) + assert computed_result.equals(expected) + + +def test_find_objects_with_empty_chunks(label_image_with_empty_chunk): + result = dask_image.ndmeasure.find_objects(label_image_with_empty_chunk) + assert isinstance(result, dd.DataFrame) + computed_result = result.compute() + assert isinstance(computed_result, pd.DataFrame) + expected = pd.DataFrame.from_dict( + {0: {111: slice(1, 3, None), 222: slice(4, 5, None)}, + 1: {111: slice(0, 2, None), 222: slice(3, 6, None)}} + ) + assert computed_result.equals(expected)