Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to specify a random subset size for computing histogram #2478

Merged
merged 4 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion glue/core/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def autotyped(cls, data, units=None):
if data.dtype.kind == 'M':
return DateTimeComponent(data)

n = coerce_numeric(data.ravel()).reshape(data.shape)
n = coerce_numeric(data)

thresh = 0.5
try:
Expand Down
59 changes: 50 additions & 9 deletions glue/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
from glue.config import settings, data_translator, subset_state_translator
from glue.utils import (compute_statistic, unbroadcast, iterate_chunks,
datetime64_to_mpl, categorical_ndarray,
format_choices, random_views_for_dask_array)
format_choices, random_views_for_dask_array,
random_indices_for_array)
from glue.core.coordinate_helpers import axis_label


Expand Down Expand Up @@ -518,7 +519,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
raise NotImplementedError()

@abc.abstractmethod
def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
def compute_histogram(self, cids, weights=None, range=None, bins=None,
log=None, subset_state=None, random_subset=None):
"""
Compute an n-dimensional histogram with regularly spaced bins.

Expand All @@ -537,6 +539,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
subset_state : :class:`~glue.core.subset.SubsetState`, optional
If specified, the histogram will only take into account values in
the subset state.
random_subset : `int`, optional
If specified, this should be an integer giving the number of values
to use for the statistic.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -1845,11 +1850,11 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
if mask is not None:
mask = da.hstack([mask[slices].ravel() for slices in random_subset_indices_dask[1]])
else:
if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.size:
self._random_subset_indices = (data.size, np.random.randint(0, data.size, random_subset))
data = data.ravel(order="K")[self._random_subset_indices[1]]
if not hasattr(self, '_random_subset_indices') or self._random_subset_indices[0] != data.shape:
self._random_subset_indices = (data.shape, random_indices_for_array(data, random_subset))
data = data[self._random_subset_indices[1]]
if mask is not None:
mask = mask.ravel(order="K")[self._random_subset_indices[1]]
mask = mask[self._random_subset_indices[1]]

result = compute_statistic(statistic, data, mask=mask, axis=axis, finite=finite,
positive=positive, percentile=percentile)
Expand Down Expand Up @@ -1877,7 +1882,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
full_result[result_slices] = result
return full_result

def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
def compute_histogram(self, cids, weights=None, range=None, bins=None,
log=None, subset_state=None, random_subset=None):
"""
Compute an n-dimensional histogram with regularly spaced bins.

Expand All @@ -1898,6 +1904,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
subset_state : :class:`~glue.core.subset.SubsetState`, optional
If specified, the histogram will only take into account values in
the subset state.
random_subset : `int`, optional
If specified, this should be an integer giving the number of values
to use for the statistic.
"""

if len(cids) > 2:
Expand Down Expand Up @@ -1947,6 +1956,38 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
else:
w = w[mask]

# TODO: avoid duplication of the following code block with compute_statistic

if random_subset and x.size > random_subset:

original_size = x.size

if DASK_INSTALLED and isinstance(x, da.Array):
# We shouldn't cache _random_subset_indices_dask here because
# it might be different for different dask arrays
random_subset_indices_dask = (x.size, random_views_for_dask_array(x, random_subset, n_chunks=10))
x = da.hstack([x[slices].ravel() for slices in random_subset_indices_dask[1]])
if ndim > 1:
y = da.hstack([y[slices].ravel() for slices in random_subset_indices_dask[1]])
if w is not None:
w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]])
else:
if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.shape:
self._random_subset_histogram_indices = (x.shape, random_indices_for_array(x, random_subset))
x = x[self._random_subset_histogram_indices[1]]
if ndim > 1:
y = y[self._random_subset_histogram_indices[1]]
if w is not None:
w = w[self._random_subset_histogram_indices[1]]

# Determine correction factor by which to scale the histogram so
# that it still has the right order of magnitude
correction = original_size / x.size

else:

correction = 1.

if ndim == 1:
xmin, xmax = range[0]
xmin, xmax = sorted((xmin, xmax))
Expand Down Expand Up @@ -2020,10 +2061,10 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,

if ndim == 1:
range = (xmin, xmax)
return histogram1d(x, range=range, bins=bins[0], weights=w)
return histogram1d(x, range=range, bins=bins[0], weights=w) * correction
elif ndim > 1:
range = [(xmin, xmax), (ymin, ymax)]
return histogram2d(x, y, range=range, bins=bins, weights=w)
return histogram2d(x, y, range=range, bins=bins, weights=w) * correction

def compute_fixed_resolution_buffer(self, *args, **kwargs):
from .fixed_resolution_buffer import compute_fixed_resolution_buffer
Expand Down
53 changes: 53 additions & 0 deletions glue/core/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,3 +1132,56 @@ def test_compute_histogram_dask_mixed():

result = data.compute_histogram([data.id['y'], data.id['x']], range=[[-0.5, 11.25], [-0.5, 11.25]], bins=[2, 3], subset_state=data.id['x'] > 4.5)
assert_allclose(result, [[0, 1, 0], [0, 2, 2]])


def test_compute_histogram_random_subset():

# Make sure that compute_histogram works with the random_subset option

with NumpyRNGContext(12345):

data = Data(x=np.random.uniform(0, 10, 1024**2), y=np.ones(1024 ** 2))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see a test for 2D image like how Jdaviz would store it. We have a 2D array under one component. Here, you store each dimension as a separate component. I don't think it is equivalent?

Since we do not use histogram viewer directly in Jdaviz, I am trying to emulate this calculation directly over at spacetelescope/jdaviz#2763 but without success. There must be some subtlety I am missing. Are you able to have a look over there and tell me if I went down the wrong rabbit hole?

Cami has a 8GB test image if you need one, if she is willing to share with you.


result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
assert_allclose(result, [178437., 230277., 230631., 230821., 178410.])

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000)
assert_allclose(result, [171756.7488, 239180.1856, 231420.7232, 228799.2832, 177419.0592])

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=10_000)
assert_allclose(result, [0., 0., 168148.277, 228024.0614, 180665.6616])

# Also check the 2D case and the case with weights. Note that the values
# now change compared to above because the random indices got reset when
# computing the subset. In a way this is a check that this is happening
# correctly.

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=10_000)
assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=10_000)
assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])


def test_compute_histogram_random_subset_dask():

# Make sure that compute_histogram works with the random_subset option (dask mode)

da = pytest.importorskip('dask.array')

data = Data(x=da.random.uniform(0, 10, 1024**2).rechunk((1024,)),
y=da.ones(1024 ** 2).rechunk((1024,)))

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=20_000)

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000)
assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=20_000)

# Also check the 2D case and the case with weights.

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000)
assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000)

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000)
assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=20_000)
17 changes: 12 additions & 5 deletions glue/utils/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
'coerce_numeric', 'check_sorted', 'unbroadcast',
'iterate_chunks', 'combine_slices', 'format_minimal', 'compute_statistic',
'categorical_ndarray', 'index_lookup', 'ensure_numerical',
'broadcast_arrays_minimal', 'random_views_for_dask_array']
'broadcast_arrays_minimal', 'random_views_for_dask_array',
'random_indices_for_array']


def unbroadcast(array):
Expand Down Expand Up @@ -150,10 +151,8 @@ def coerce_numeric(arr):
return arr.astype(int)

# a string dtype, or anything else
try:
return pd.to_numeric(arr, errors='coerce')
except AttributeError: # pandas < 0.19
return pd.Series(arr).convert_objects(convert_numeric=True).values
original_shape = arr.shape
return pd.to_numeric(arr.reshape((arr.size)), errors='coerce').reshape(original_shape)


def check_sorted(array):
Expand Down Expand Up @@ -632,3 +631,11 @@ def random_views_for_dask_array(array, n_random_samples, n_chunks):
all_slices.append(tuple(slices))

return all_slices


def random_indices_for_array(array, n_random_samples):
"""
Return a tuple of arrays that can be used to index the input
array to obtain a random sample of values.
"""
return tuple(np.random.randint(0, size, n_random_samples) for size in array.shape)
2 changes: 1 addition & 1 deletion glue/viewers/histogram/layer_artist.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def _update_histogram(self, force=False, **kwargs):
# of updated properties is up to date after this method has been called.
changed = self.pop_changed_properties()

if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative')):
if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative', 'random_subset')):
self._calculate_histogram(reset=force)

if force or any(prop in changed for prop in ('alpha', 'color', 'zorder', 'visible')):
Expand Down
8 changes: 7 additions & 1 deletion glue/viewers/histogram/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ class HistogramViewerState(MatplotlibDataViewerState):

update_bins_on_reset_limits = DDCProperty(True, docstring="Whether to update the bins to match the view when resetting limits")

random_subset = DDCProperty(None, docstring='The maximum number of elements to use '
'when computing the histogram. If the data '
'is larger than this, a random subset of '
'the data will be used.')

def __init__(self, **kwargs):

super(HistogramViewerState, self).__init__()
Expand Down Expand Up @@ -260,7 +265,8 @@ def update_histogram(self):
range=[range],
bins=[self._viewer_state.hist_n_bin],
log=[self._viewer_state.x_log],
subset_state=subset_state)
subset_state=subset_state,
random_subset=self._viewer_state.random_subset)

# TODO: determine whether this belongs here or in the layer artist
if isinstance(range[0], np.datetime64):
Expand Down