Skip to content

Commit

Permalink
Merge pull request #2478 from astrofrog/random-sampling-histogram
Browse files Browse the repository at this point in the history
Add the ability to specify a random subset size for computing histogram
  • Loading branch information
astrofrog authored Apr 16, 2024
2 parents 58fc3a3 + 859ac5c commit b9ca12f
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 17 deletions.
2 changes: 1 addition & 1 deletion glue/core/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def autotyped(cls, data, units=None):
if data.dtype.kind == 'M':
return DateTimeComponent(data)

n = coerce_numeric(data.ravel()).reshape(data.shape)
n = coerce_numeric(data)

thresh = 0.5
try:
Expand Down
59 changes: 50 additions & 9 deletions glue/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
from glue.config import settings, data_translator, subset_state_translator
from glue.utils import (compute_statistic, unbroadcast, iterate_chunks,
datetime64_to_mpl, categorical_ndarray,
format_choices, random_views_for_dask_array)
format_choices, random_views_for_dask_array,
random_indices_for_array)
from glue.core.coordinate_helpers import axis_label


Expand Down Expand Up @@ -518,7 +519,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
raise NotImplementedError()

@abc.abstractmethod
def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
def compute_histogram(self, cids, weights=None, range=None, bins=None,
log=None, subset_state=None, random_subset=None):
"""
Compute an n-dimensional histogram with regularly spaced bins.
Expand All @@ -537,6 +539,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
subset_state : :class:`~glue.core.subset.SubsetState`, optional
If specified, the histogram will only take into account values in
the subset state.
random_subset : `int`, optional
If specified, this should be an integer giving the number of values
to use for the statistic.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -1845,11 +1850,11 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
if mask is not None:
mask = da.hstack([mask[slices].ravel() for slices in random_subset_indices_dask[1]])
else:
if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size:
self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset))
data = data.ravel(order="K")[self._random_subset_indices[1]]
if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.shape:
self._random_subset_indices = (data.shape, random_indices_for_array(data, random_subset))
data = data[self._random_subset_indices[1]]
if mask is not None:
mask = mask.ravel(order="K")[self._random_subset_indices[1]]
mask = mask[self._random_subset_indices[1]]

result = compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
positive=positive, percentile=percentile)
Expand Down Expand Up @@ -1877,7 +1882,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
full_result[result_slices] = result
return full_result

def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
def compute_histogram(self, cids, weights=None, range=None, bins=None,
log=None, subset_state=None, random_subset=None):
"""
Compute an n-dimensional histogram with regularly spaced bins.
Expand All @@ -1898,6 +1904,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
subset_state : :class:`~glue.core.subset.SubsetState`, optional
If specified, the histogram will only take into account values in
the subset state.
random_subset : `int`, optional
If specified, this should be an integer giving the number of values
to use for the statistic.
"""

if len(cids) > 2:
Expand Down Expand Up @@ -1947,6 +1956,38 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
else:
w = w[mask]

# TODO: avoid duplication of the following code block with compute_statistic

if random_subset and x.size > random_subset:

original_size = x.size

if DASK_INSTALLED and isinstance(x, da.Array):
# We shouldn't cache _random_subset_indices_dask here because
# it might be different for different dask arrays
random_subset_indices_dask = (x.size, random_views_for_dask_array(x, random_subset, n_chunks=10))
x = da.hstack([x[slices].ravel() for slices in random_subset_indices_dask[1]])
if ndim > 1:
y = da.hstack([y[slices].ravel() for slices in random_subset_indices_dask[1]])
if w is not None:
w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]])
else:
if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.shape:
self._random_subset_histogram_indices = (x.shape, random_indices_for_array(x, random_subset))
x = x[self._random_subset_histogram_indices[1]]
if ndim > 1:
y = y[self._random_subset_histogram_indices[1]]
if w is not None:
w = w[self._random_subset_histogram_indices[1]]

# Determine correction factor by which to scale the histogram so
# that it still has the right order of magnitude
correction = original_size / x.size

else:

correction = 1.

if ndim == 1:
xmin, xmax = range[0]
xmin, xmax = sorted((xmin, xmax))
Expand Down Expand Up @@ -2020,10 +2061,10 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,

if ndim == 1:
range = (xmin, xmax)
return histogram1d(x, range=range, bins=bins[0], weights=w)
return histogram1d(x, range=range, bins=bins[0], weights=w) * correction
elif ndim > 1:
range = [(xmin, xmax), (ymin, ymax)]
return histogram2d(x, y, range=range, bins=bins, weights=w)
return histogram2d(x, y, range=range, bins=bins, weights=w) * correction

def compute_fixed_resolution_buffer(self, *args, **kwargs):
from .fixed_resolution_buffer import compute_fixed_resolution_buffer
Expand Down
53 changes: 53 additions & 0 deletions glue/core/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,3 +1132,56 @@ def test_compute_histogram_dask_mixed():

result = data.compute_histogram([data.id['y'], data.id['x']], range=[[-0.5, 11.25], [-0.5, 11.25]], bins=[2, 3], subset_state=data.id['x'] > 4.5)
assert_allclose(result, [[0, 1, 0], [0, 2, 2]])


def test_compute_histogram_random_subset():

# Make sure that compute_histogram works with the random_subset option

with NumpyRNGContext(12345):

data = Data(x=np.random.uniform(0, 10, 1024**2), y=np.ones(1024 ** 2))

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
assert_allclose(result, [178437., 230277., 230631., 230821., 178410.])

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000)
assert_allclose(result, [171756.7488, 239180.1856, 231420.7232, 228799.2832, 177419.0592])

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=10_000)
assert_allclose(result, [0., 0., 168148.277, 228024.0614, 180665.6616])

# Also check the 2D case and the case with weights. Note that the values
# now change compared to above because the random indices got reset when
# computing the subset. In a way this is a check that this is happening
# correctly.

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=10_000)
assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=10_000)
assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])


def test_compute_histogram_random_subset_dask():

# Make sure that compute_histogram works with the random_subset option (dask mode)

da = pytest.importorskip('dask.array')

data = Data(x=da.random.uniform(0, 10, 1024**2).rechunk((1024,)),
y=da.ones(1024 ** 2).rechunk((1024,)))

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=20_000)

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000)
assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=20_000)

# Also check the 2D case and the case with weights.

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000)
assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000)

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000)
assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000)
17 changes: 12 additions & 5 deletions glue/utils/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
'coerce_numeric', 'check_sorted', 'unbroadcast',
'iterate_chunks', 'combine_slices', 'format_minimal', 'compute_statistic',
'categorical_ndarray', 'index_lookup', 'ensure_numerical',
'broadcast_arrays_minimal', 'random_views_for_dask_array']
'broadcast_arrays_minimal', 'random_views_for_dask_array',
'random_indices_for_array']


def unbroadcast(array):
Expand Down Expand Up @@ -150,10 +151,8 @@ def coerce_numeric(arr):
return arr.astype(int)

# a string dtype, or anything else
try:
return pd.to_numeric(arr, errors='coerce')
except AttributeError: # pandas < 0.19
return pd.Series(arr).convert_objects(convert_numeric=True).values
original_shape = arr.shape
return pd.to_numeric(arr.reshape((arr.size)), errors='coerce').reshape(original_shape)


def check_sorted(array):
Expand Down Expand Up @@ -632,3 +631,11 @@ def random_views_for_dask_array(array, n_random_samples, n_chunks):
all_slices.append(tuple(slices))

return all_slices


def random_indices_for_array(array, n_random_samples):
"""
Return a tuple of arrays that can be used to index the input
array to obtain a random sample of values.
"""
return tuple(np.random.randint(0, size, n_random_samples) for size in array.shape)
2 changes: 1 addition & 1 deletion glue/viewers/histogram/layer_artist.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def _update_histogram(self, force=False, **kwargs):
# of updated properties is up to date after this method has been called.
changed = self.pop_changed_properties()

if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative')):
if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative', 'random_subset')):
self._calculate_histogram(reset=force)

if force or any(prop in changed for prop in ('alpha', 'color', 'zorder', 'visible')):
Expand Down
8 changes: 7 additions & 1 deletion glue/viewers/histogram/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ class HistogramViewerState(MatplotlibDataViewerState):

update_bins_on_reset_limits = DDCProperty(True, docstring="Whether to update the bins to match the view when resetting limits")

random_subset = DDCProperty(None, docstring='The maximum number of elements to use '
'when computing the histogram. If the data '
'is larger than this, a random subset of '
'the data will be used.')

def __init__(self, **kwargs):

super(HistogramViewerState, self).__init__()
Expand Down Expand Up @@ -260,7 +265,8 @@ def update_histogram(self):
range=[range],
bins=[self._viewer_state.hist_n_bin],
log=[self._viewer_state.x_log],
subset_state=subset_state)
subset_state=subset_state,
random_subset=self._viewer_state.random_subset)

# TODO: determine whether this belongs here or in the layer artist
if isinstance(range[0], np.datetime64):
Expand Down

0 comments on commit b9ca12f

Please sign in to comment.