From 4e44393c08a0a755b709ad5ce2919e6df0a396a2 Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 19 Mar 2024 10:32:27 +0000 Subject: [PATCH 1/4] Add the ability to specify a random subset size for computing histograms, and expose as a viewer state property in the histogram viewer --- glue/core/data.py | 48 +++++++++++++++++++++-- glue/core/tests/test_data.py | 53 ++++++++++++++++++++++++++ glue/viewers/histogram/layer_artist.py | 2 +- glue/viewers/histogram/state.py | 8 +++- 4 files changed, 105 insertions(+), 6 deletions(-) diff --git a/glue/core/data.py b/glue/core/data.py index 6313eace5..9f9b7d1c4 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -518,7 +518,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, raise NotImplementedError() @abc.abstractmethod - def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None): + def compute_histogram(self, cids, weights=None, range=None, bins=None, + log=None, subset_state=None, random_subset=None): """ Compute an n-dimensional histogram with regularly spaced bins. @@ -537,6 +538,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state : :class:`~glue.core.subset.SubsetState`, optional If specified, the histogram will only take into account values in the subset state. + random_subset : `int`, optional + If specified, this should be an integer giving the number of values + to use for the statistic. """ raise NotImplementedError() @@ -1877,7 +1881,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, full_result[result_slices] = result return full_result - def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None): + def compute_histogram(self, cids, weights=None, range=None, bins=None, + log=None, subset_state=None, random_subset=None): """ Compute an n-dimensional histogram with regularly spaced bins. @@ -1898,6 +1903,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state : :class:`~glue.core.subset.SubsetState`, optional If specified, the histogram will only take into account values in the subset state. + random_subset : `int`, optional + If specified, this should be an integer giving the number of values + to use for the statistic. """ if len(cids) > 2: @@ -1947,6 +1955,38 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, else: w = w[mask] + # TODO: avoid duplication of the following code block with compute_statistic + + if random_subset and x.size > random_subset: + + original_size = x.size + + if DASK_INSTALLED and isinstance(x, da.Array): + # We shouldn't cache _random_subset_indices_dask here because + # it might be different for different dask arrays + random_subset_indices_dask = (x.size, random_views_for_dask_array(x, random_subset, n_chunks=10)) + x = da.hstack([x[slices].ravel() for slices in random_subset_indices_dask[1]]) + if ndim > 1: + y = da.hstack([y[slices].ravel() for slices in random_subset_indices_dask[1]]) + if w is not None: + w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]]) + else: + if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.size: + self._random_subset_histogram_indices = (x.size, np.random.randint(0, x.size, random_subset)) + x = x.ravel(order="K")[self._random_subset_histogram_indices[1]] + if ndim > 1: + y = y.ravel(order="K")[self._random_subset_histogram_indices[1]] + if w is not None: + w = w.ravel(order="K")[self._random_subset_histogram_indices[1]] + + # Determine correction factor by which to scale the histogram so + # that it still has the right order of magnitude + correction = original_size / x.size + + else: + + correction = 1. + if ndim == 1: xmin, xmax = range[0] xmin, xmax = sorted((xmin, xmax)) @@ -2020,10 +2060,10 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, if ndim == 1: range = (xmin, xmax) - return histogram1d(x, range=range, bins=bins[0], weights=w) + return histogram1d(x, range=range, bins=bins[0], weights=w) * correction elif ndim > 1: range = [(xmin, xmax), (ymin, ymax)] - return histogram2d(x, y, range=range, bins=bins, weights=w) + return histogram2d(x, y, range=range, bins=bins, weights=w) * correction def compute_fixed_resolution_buffer(self, *args, **kwargs): from .fixed_resolution_buffer import compute_fixed_resolution_buffer diff --git a/glue/core/tests/test_data.py b/glue/core/tests/test_data.py index b20382da3..8ffb16bfd 100644 --- a/glue/core/tests/test_data.py +++ b/glue/core/tests/test_data.py @@ -1132,3 +1132,56 @@ def test_compute_histogram_dask_mixed(): result = data.compute_histogram([data.id['y'], data.id['x']], range=[[-0.5, 11.25], [-0.5, 11.25]], bins=[2, 3], subset_state=data.id['x'] > 4.5) assert_allclose(result, [[0, 1, 0], [0, 2, 2]]) + + +def test_compute_histogram_random_subset(): + + # Make sure that compute_histogram works with the random_subset option + + with NumpyRNGContext(12345): + + data = Data(x=np.random.uniform(0, 10, 1024**2), y=np.ones(1024 ** 2)) + + result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000) + assert_allclose(result, [178437., 230277., 230631., 230821., 178410.]) + + result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000) + assert_allclose(result, [171756.7488, 239180.1856, 231420.7232, 228799.2832, 177419.0592]) + + result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=10_000) + assert_allclose(result, [0., 0., 168148.277, 228024.0614, 180665.6616]) + + # Also check the 2D case and the case with weights. Note that the values + # now change compared to above because the random indices got reset when + # computing the subset. In a way this is a check that this is happening + # correctly. + + result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=10_000) + assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528]) + + result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=10_000) + assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528]) + + +def test_compute_histogram_random_subset_dask(): + + # Make sure that compute_histogram works with the random_subset option (dask mode) + + da = pytest.importorskip('dask.array') + + data = Data(x=da.random.uniform(0, 10, 1024**2).rechunk((1024,)), + y=da.ones(1024 ** 2).rechunk((1024,))) + + result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000) + assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=10_000) + + result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000) + assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=10_000) + + # Also check the 2D case and the case with weights. + + result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000) + assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000) + + result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000) + assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000) diff --git a/glue/viewers/histogram/layer_artist.py b/glue/viewers/histogram/layer_artist.py index 69ef0a22c..0f3960fd2 100644 --- a/glue/viewers/histogram/layer_artist.py +++ b/glue/viewers/histogram/layer_artist.py @@ -167,7 +167,7 @@ def _update_histogram(self, force=False, **kwargs): # of updated properties is up to date after this method has been called. changed = self.pop_changed_properties() - if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative')): + if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative', 'random_subset')): self._calculate_histogram(reset=force) if force or any(prop in changed for prop in ('alpha', 'color', 'zorder', 'visible')): diff --git a/glue/viewers/histogram/state.py b/glue/viewers/histogram/state.py index 5e2b38754..745016484 100644 --- a/glue/viewers/histogram/state.py +++ b/glue/viewers/histogram/state.py @@ -42,6 +42,11 @@ class HistogramViewerState(MatplotlibDataViewerState): update_bins_on_reset_limits = DDCProperty(True, docstring="Whether to update the bins to match the view when resetting limits") + random_subset = DDCProperty(None, docstring='The maximum number of elements to use ' + 'when computing the histogram. If the data ' + 'is larger than this, a random subset of ' + 'the data will be used.') + def __init__(self, **kwargs): super(HistogramViewerState, self).__init__() @@ -260,7 +265,8 @@ def update_histogram(self): range=[range], bins=[self._viewer_state.hist_n_bin], log=[self._viewer_state.x_log], - subset_state=subset_state) + subset_state=subset_state, + random_subset=self._viewer_state.random_subset) # TODO: determine whether this belongs here or in the layer artist if isinstance(range[0], np.datetime64): From ad0fa670572c2db80fbca324ae0f373abe7a75aa Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 19 Mar 2024 10:41:03 +0000 Subject: [PATCH 2/4] Relax tolerance --- glue/core/tests/test_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/glue/core/tests/test_data.py b/glue/core/tests/test_data.py index 8ffb16bfd..baa1aa825 100644 --- a/glue/core/tests/test_data.py +++ b/glue/core/tests/test_data.py @@ -1173,15 +1173,15 @@ def test_compute_histogram_random_subset_dask(): y=da.ones(1024 ** 2).rechunk((1024,))) result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000) - assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=10_000) + assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=20_000) result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000) - assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=10_000) + assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=20_000) # Also check the 2D case and the case with weights. result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000) - assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000) + assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000) result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000) - assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000) + assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000) From ce96be65e62e0eff2a86391548933de4e3c8b176 Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 9 Apr 2024 12:05:53 +0100 Subject: [PATCH 3/4] Avoid calling ravel() in compute_histogram and compute_statistic since for some arrays (e.g. a cutout of a larger image) this causes a copy --- glue/core/component.py | 2 +- glue/core/data.py | 21 +++++++++++---------- glue/utils/array.py | 17 ++++++++++++----- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/glue/core/component.py b/glue/core/component.py index fe7202b46..67a7a6d51 100644 --- a/glue/core/component.py +++ b/glue/core/component.py @@ -164,7 +164,7 @@ def autotyped(cls, data, units=None): if data.dtype.kind == 'M': return DateTimeComponent(data) - n = coerce_numeric(data.ravel()).reshape(data.shape) + n = coerce_numeric(data) thresh = 0.5 try: diff --git a/glue/core/data.py b/glue/core/data.py index 9f9b7d1c4..8d64c8c1a 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -29,7 +29,8 @@ from glue.config import settings, data_translator, subset_state_translator from glue.utils import (compute_statistic, unbroadcast, iterate_chunks, datetime64_to_mpl, categorical_ndarray, - format_choices, random_views_for_dask_array) + format_choices, random_views_for_dask_array, + random_indices_for_array) from glue.core.coordinate_helpers import axis_label @@ -1849,11 +1850,11 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, if mask is not None: mask = da.hstack([mask[slices].ravel() for slices in random_subset_indices_dask[1]]) else: - if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size: - self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset)) - data = data.ravel(order="K")[self._random_subset_indices[1]] + if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.shape: + self._random_subset_indices = (data.shape, random_indices_for_array(data, random_subset)) + data = data[*self._random_subset_indices[1]] if mask is not None: - mask = mask.ravel(order="K")[self._random_subset_indices[1]] + mask = mask[*self._random_subset_indices[1]] result = compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite, positive=positive, percentile=percentile) @@ -1971,13 +1972,13 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, if w is not None: w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]]) else: - if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.size: - self._random_subset_histogram_indices = (x.size, np.random.randint(0, x.size, random_subset)) - x = x.ravel(order="K")[self._random_subset_histogram_indices[1]] + if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.shape: + self._random_subset_histogram_indices = (x.shape, random_indices_for_array(x, random_subset)) + x = x[*self._random_subset_histogram_indices[1]] if ndim > 1: - y = y.ravel(order="K")[self._random_subset_histogram_indices[1]] + y = y[*self._random_subset_histogram_indices[1]] if w is not None: - w = w.ravel(order="K")[self._random_subset_histogram_indices[1]] + w = w[*self._random_subset_histogram_indices[1]] # Determine correction factor by which to scale the histogram so # that it still has the right order of magnitude diff --git a/glue/utils/array.py b/glue/utils/array.py index dabc33307..9213e993f 100644 --- a/glue/utils/array.py +++ b/glue/utils/array.py @@ -12,7 +12,8 @@ 'coerce_numeric', 'check_sorted', 'unbroadcast', 'iterate_chunks', 'combine_slices', 'format_minimal', 'compute_statistic', 'categorical_ndarray', 'index_lookup', 'ensure_numerical', - 'broadcast_arrays_minimal', 'random_views_for_dask_array'] + 'broadcast_arrays_minimal', 'random_views_for_dask_array', + 'random_indices_for_array'] def unbroadcast(array): @@ -150,10 +151,8 @@ def coerce_numeric(arr): return arr.astype(int) # a string dtype, or anything else - try: - return pd.to_numeric(arr, errors='coerce') - except AttributeError: # pandas < 0.19 - return pd.Series(arr).convert_objects(convert_numeric=True).values + original_shape = arr.shape + return pd.to_numeric(arr.reshape((arr.size)), errors='coerce').reshape(original_shape) def check_sorted(array): @@ -632,3 +631,11 @@ def random_views_for_dask_array(array, n_random_samples, n_chunks): all_slices.append(tuple(slices)) return all_slices + + +def random_indices_for_array(array, n_random_samples): + """ + Return a tuple of arrays that can be used to index the input + array to obtain a random sample of values. + """ + return tuple(np.random.randint(0, size, n_random_samples) for size in array.shape) From 859ac5c37618bda47f51f5953f8cc6f75d3228eb Mon Sep 17 00:00:00 2001 From: Thomas Robitaille Date: Tue, 9 Apr 2024 12:16:26 +0100 Subject: [PATCH 4/4] Fix syntax for Python<3.11 --- glue/core/data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/glue/core/data.py b/glue/core/data.py index 8d64c8c1a..7bcb9fe4f 100644 --- a/glue/core/data.py +++ b/glue/core/data.py @@ -1852,9 +1852,9 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None, else: if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.shape: self._random_subset_indices = (data.shape, random_indices_for_array(data, random_subset)) - data = data[*self._random_subset_indices[1]] + data = data[self._random_subset_indices[1]] if mask is not None: - mask = mask[*self._random_subset_indices[1]] + mask = mask[self._random_subset_indices[1]] result = compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite, positive=positive, percentile=percentile) @@ -1974,11 +1974,11 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, else: if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.shape: self._random_subset_histogram_indices = (x.shape, random_indices_for_array(x, random_subset)) - x = x[*self._random_subset_histogram_indices[1]] + x = x[self._random_subset_histogram_indices[1]] if ndim > 1: - y = y[*self._random_subset_histogram_indices[1]] + y = y[self._random_subset_histogram_indices[1]] if w is not None: - w = w[*self._random_subset_histogram_indices[1]] + w = w[self._random_subset_histogram_indices[1]] # Determine correction factor by which to scale the histogram so # that it still has the right order of magnitude