Merge pull request #2478 from astrofrog/random-sampling-histogram

Add the ability to specify a random subset size for computing histogram
glue-viz · Apr 16, 2024 · b9ca12f · b9ca12f
2 parents 58fc3a3 + 859ac5c
commit b9ca12f
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 17 deletions.
diff --git a/glue/core/component.py b/glue/core/component.py
@@ -164,7 +164,7 @@ def autotyped(cls, data, units=None):
         if data.dtype.kind == 'M':
             return DateTimeComponent(data)
 
-        n = coerce_numeric(data.ravel()).reshape(data.shape)
+        n = coerce_numeric(data)
 
         thresh = 0.5
         try:

diff --git a/glue/core/data.py b/glue/core/data.py
@@ -29,7 +29,8 @@
 from glue.config import settings, data_translator, subset_state_translator
 from glue.utils import (compute_statistic, unbroadcast, iterate_chunks,
                         datetime64_to_mpl, categorical_ndarray,
-                        format_choices, random_views_for_dask_array)
+                        format_choices, random_views_for_dask_array,
+                        random_indices_for_array)
 from glue.core.coordinate_helpers import axis_label
 
 
@@ -518,7 +519,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
         raise NotImplementedError()
 
     @abc.abstractmethod
-    def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
+    def compute_histogram(self, cids, weights=None, range=None, bins=None,
+                          log=None, subset_state=None, random_subset=None):
         """
         Compute an n-dimensional histogram with regularly spaced bins.
 
@@ -537,6 +539,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
         subset_state : :class:`~glue.core.subset.SubsetState`, optional
             If specified, the histogram will only take into account values in
             the subset state.
+        random_subset : `int`, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic.
         """
         raise NotImplementedError()
 
@@ -1845,11 +1850,11 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
                 if mask is not None:
                     mask = da.hstack([mask[slices].ravel() for slices in random_subset_indices_dask[1]])
             else:
-                if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size:
-                    self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset))
-                data = data.ravel(order="K")[self._random_subset_indices[1]]
+                if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.shape:
+                    self._random_subset_indices = (data.shape, random_indices_for_array(data, random_subset))
+                data = data[self._random_subset_indices[1]]
                 if mask is not None:
-                    mask = mask.ravel(order="K")[self._random_subset_indices[1]]
+                    mask = mask[self._random_subset_indices[1]]
 
         result = compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
                                    positive=positive, percentile=percentile)
@@ -1877,7 +1882,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             full_result[result_slices] = result
             return full_result
 
-    def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
+    def compute_histogram(self, cids, weights=None, range=None, bins=None,
+                          log=None, subset_state=None, random_subset=None):
         """
         Compute an n-dimensional histogram with regularly spaced bins.
 
@@ -1898,6 +1904,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
         subset_state : :class:`~glue.core.subset.SubsetState`, optional
             If specified, the histogram will only take into account values in
             the subset state.
+        random_subset : `int`, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic.
         """
 
         if len(cids) > 2:
@@ -1947,6 +1956,38 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
                 else:
                     w = w[mask]
 
+        # TODO: avoid duplication of the following code block with compute_statistic
+
+        if random_subset and x.size > random_subset:
+
+            original_size = x.size
+
+            if DASK_INSTALLED and isinstance(x, da.Array):
+                # We shouldn't cache _random_subset_indices_dask here because
+                # it might be different for different dask arrays
+                random_subset_indices_dask = (x.size, random_views_for_dask_array(x, random_subset, n_chunks=10))
+                x = da.hstack([x[slices].ravel() for slices in random_subset_indices_dask[1]])
+                if ndim > 1:
+                    y = da.hstack([y[slices].ravel() for slices in random_subset_indices_dask[1]])
+                if w is not None:
+                    w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]])
+            else:
+                if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.shape:
+                    self._random_subset_histogram_indices = (x.shape, random_indices_for_array(x, random_subset))
+                x = x[self._random_subset_histogram_indices[1]]
+                if ndim > 1:
+                    y = y[self._random_subset_histogram_indices[1]]
+                if w is not None:
+                    w = w[self._random_subset_histogram_indices[1]]
+
+            # Determine correction factor by which to scale the histogram so
+            # that it still has the right order of magnitude
+            correction = original_size / x.size
+
+        else:
+
+            correction = 1.
+
         if ndim == 1:
             xmin, xmax = range[0]
             xmin, xmax = sorted((xmin, xmax))
@@ -2020,10 +2061,10 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
 
         if ndim == 1:
             range = (xmin, xmax)
-            return histogram1d(x, range=range, bins=bins[0], weights=w)
+            return histogram1d(x, range=range, bins=bins[0], weights=w) * correction
         elif ndim > 1:
             range = [(xmin, xmax), (ymin, ymax)]
-            return histogram2d(x, y, range=range, bins=bins, weights=w)
+            return histogram2d(x, y, range=range, bins=bins, weights=w) * correction
 
     def compute_fixed_resolution_buffer(self, *args, **kwargs):
         from .fixed_resolution_buffer import compute_fixed_resolution_buffer

diff --git a/glue/core/tests/test_data.py b/glue/core/tests/test_data.py
@@ -1132,3 +1132,56 @@ def test_compute_histogram_dask_mixed():
 
     result = data.compute_histogram([data.id['y'], data.id['x']], range=[[-0.5, 11.25], [-0.5, 11.25]], bins=[2, 3], subset_state=data.id['x'] > 4.5)
     assert_allclose(result, [[0, 1, 0], [0, 2, 2]])
+
+
+def test_compute_histogram_random_subset():
+
+    # Make sure that compute_histogram works with the random_subset option
+
+    with NumpyRNGContext(12345):
+
+        data = Data(x=np.random.uniform(0, 10, 1024**2), y=np.ones(1024 ** 2))
+
+        result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
+        assert_allclose(result, [178437., 230277., 230631., 230821., 178410.])
+
+        result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000)
+        assert_allclose(result, [171756.7488, 239180.1856, 231420.7232, 228799.2832, 177419.0592])
+
+        result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=10_000)
+        assert_allclose(result, [0., 0., 168148.277, 228024.0614, 180665.6616])
+
+        # Also check the 2D case and the case with weights. Note that the values
+        # now change compared to above because the random indices got reset when
+        # computing the subset. In a way this is a check that this is happening
+        # correctly.
+
+        result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=10_000)
+        assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])
+
+        result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=10_000)
+        assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])
+
+
+def test_compute_histogram_random_subset_dask():
+
+    # Make sure that compute_histogram works with the random_subset option (dask mode)
+
+    da = pytest.importorskip('dask.array')
+
+    data = Data(x=da.random.uniform(0, 10, 1024**2).rechunk((1024,)),
+                y=da.ones(1024 ** 2).rechunk((1024,)))
+
+    result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
+    assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=20_000)
+
+    result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000)
+    assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=20_000)
+
+    # Also check the 2D case and the case with weights.
+
+    result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000)
+    assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000)
+
+    result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000)
+    assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000)
diff --git a/glue/utils/array.py b/glue/utils/array.py
@@ -12,7 +12,8 @@
            'coerce_numeric', 'check_sorted', 'unbroadcast',
            'iterate_chunks', 'combine_slices', 'format_minimal', 'compute_statistic',
            'categorical_ndarray', 'index_lookup', 'ensure_numerical',
-           'broadcast_arrays_minimal', 'random_views_for_dask_array']
+           'broadcast_arrays_minimal', 'random_views_for_dask_array',
+           'random_indices_for_array']
 
 
 def unbroadcast(array):
@@ -150,10 +151,8 @@ def coerce_numeric(arr):
         return arr.astype(int)
 
     # a string dtype, or anything else
-    try:
-        return pd.to_numeric(arr, errors='coerce')
-    except AttributeError:  # pandas < 0.19
-        return pd.Series(arr).convert_objects(convert_numeric=True).values
+    original_shape = arr.shape
+    return pd.to_numeric(arr.reshape((arr.size)), errors='coerce').reshape(original_shape)
 
 
 def check_sorted(array):
@@ -632,3 +631,11 @@ def random_views_for_dask_array(array, n_random_samples, n_chunks):
         all_slices.append(tuple(slices))
 
     return all_slices
+
+
+def random_indices_for_array(array, n_random_samples):
+    """
+    Return a tuple of arrays that can be used to index the input
+    array to obtain a random sample of values.
+    """
+    return tuple(np.random.randint(0, size, n_random_samples) for size in array.shape)
diff --git a/glue/viewers/histogram/layer_artist.py b/glue/viewers/histogram/layer_artist.py
@@ -167,7 +167,7 @@ def _update_histogram(self, force=False, **kwargs):
         # of updated properties is up to date after this method has been called.
         changed = self.pop_changed_properties()
 
-        if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative')):
+        if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative', 'random_subset')):
             self._calculate_histogram(reset=force)
 
         if force or any(prop in changed for prop in ('alpha', 'color', 'zorder', 'visible')):

diff --git a/glue/viewers/histogram/state.py b/glue/viewers/histogram/state.py
@@ -42,6 +42,11 @@ class HistogramViewerState(MatplotlibDataViewerState):
 
     update_bins_on_reset_limits = DDCProperty(True, docstring="Whether to update the bins to match the view when resetting limits")
 
+    random_subset = DDCProperty(None, docstring='The maximum number of elements to use '
+                                                'when computing the histogram. If the data '
+                                                'is larger than this, a random subset of '
+                                                'the data will be used.')
+
     def __init__(self, **kwargs):
 
         super(HistogramViewerState, self).__init__()
@@ -260,7 +265,8 @@ def update_histogram(self):
                                              range=[range],
                                              bins=[self._viewer_state.hist_n_bin],
                                              log=[self._viewer_state.x_log],
-                                             subset_state=subset_state)
+                                             subset_state=subset_state,
+                                             random_subset=self._viewer_state.random_subset)
 
         # TODO: determine whether this belongs here or in the layer artist
         if isinstance(range[0], np.datetime64):