Skip to content

Commit

Permalink
Add the ability to specify a random subset size for computing histogr…
Browse files Browse the repository at this point in the history
…ams, and expose as a viewer state property in the histogram viewer
  • Loading branch information
astrofrog committed Mar 19, 2024
1 parent 1bbd6e8 commit 4e44393
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 6 deletions.
48 changes: 44 additions & 4 deletions glue/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
raise NotImplementedError()

@abc.abstractmethod
def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
def compute_histogram(self, cids, weights=None, range=None, bins=None,
log=None, subset_state=None, random_subset=None):
"""
Compute an n-dimensional histogram with regularly spaced bins.
Expand All @@ -537,6 +538,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
subset_state : :class:`~glue.core.subset.SubsetState`, optional
If specified, the histogram will only take into account values in
the subset state.
random_subset : `int`, optional
If specified, this should be an integer giving the number of values
to use for the statistic.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -1877,7 +1881,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
full_result[result_slices] = result
return full_result

def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
def compute_histogram(self, cids, weights=None, range=None, bins=None,
log=None, subset_state=None, random_subset=None):
"""
Compute an n-dimensional histogram with regularly spaced bins.
Expand All @@ -1898,6 +1903,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
subset_state : :class:`~glue.core.subset.SubsetState`, optional
If specified, the histogram will only take into account values in
the subset state.
random_subset : `int`, optional
If specified, this should be an integer giving the number of values
to use for the statistic.
"""

if len(cids) > 2:
Expand Down Expand Up @@ -1947,6 +1955,38 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
else:
w = w[mask]

# TODO: avoid duplication of the following code block with compute_statistic

if random_subset and x.size > random_subset:

original_size = x.size

if DASK_INSTALLED and isinstance(x, da.Array):
# We shouldn't cache _random_subset_indices_dask here because
# it might be different for different dask arrays
random_subset_indices_dask = (x.size, random_views_for_dask_array(x, random_subset, n_chunks=10))
x = da.hstack([x[slices].ravel() for slices in random_subset_indices_dask[1]])
if ndim > 1:
y = da.hstack([y[slices].ravel() for slices in random_subset_indices_dask[1]])
if w is not None:
w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]])
else:
if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.size:
self._random_subset_histogram_indices = (x.size, np.random.randint(0, x.size, random_subset))
x = x.ravel(order="K")[self._random_subset_histogram_indices[1]]
if ndim > 1:
y = y.ravel(order="K")[self._random_subset_histogram_indices[1]]
if w is not None:
w = w.ravel(order="K")[self._random_subset_histogram_indices[1]]

# Determine correction factor by which to scale the histogram so
# that it still has the right order of magnitude
correction = original_size / x.size

else:

correction = 1.

if ndim == 1:
xmin, xmax = range[0]
xmin, xmax = sorted((xmin, xmax))
Expand Down Expand Up @@ -2020,10 +2060,10 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,

if ndim == 1:
range = (xmin, xmax)
return histogram1d(x, range=range, bins=bins[0], weights=w)
return histogram1d(x, range=range, bins=bins[0], weights=w) * correction
elif ndim > 1:
range = [(xmin, xmax), (ymin, ymax)]
return histogram2d(x, y, range=range, bins=bins, weights=w)
return histogram2d(x, y, range=range, bins=bins, weights=w) * correction

def compute_fixed_resolution_buffer(self, *args, **kwargs):
from .fixed_resolution_buffer import compute_fixed_resolution_buffer
Expand Down
53 changes: 53 additions & 0 deletions glue/core/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,3 +1132,56 @@ def test_compute_histogram_dask_mixed():

result = data.compute_histogram([data.id['y'], data.id['x']], range=[[-0.5, 11.25], [-0.5, 11.25]], bins=[2, 3], subset_state=data.id['x'] > 4.5)
assert_allclose(result, [[0, 1, 0], [0, 2, 2]])


def test_compute_histogram_random_subset():

# Make sure that compute_histogram works with the random_subset option

with NumpyRNGContext(12345):

data = Data(x=np.random.uniform(0, 10, 1024**2), y=np.ones(1024 ** 2))

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
assert_allclose(result, [178437., 230277., 230631., 230821., 178410.])

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000)
assert_allclose(result, [171756.7488, 239180.1856, 231420.7232, 228799.2832, 177419.0592])

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=10_000)
assert_allclose(result, [0., 0., 168148.277, 228024.0614, 180665.6616])

# Also check the 2D case and the case with weights. Note that the values
# now change compared to above because the random indices got reset when
# computing the subset. In a way this is a check that this is happening
# correctly.

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=10_000)
assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=10_000)
assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])


def test_compute_histogram_random_subset_dask():

# Make sure that compute_histogram works with the random_subset option (dask mode)

da = pytest.importorskip('dask.array')

data = Data(x=da.random.uniform(0, 10, 1024**2).rechunk((1024,)),
y=da.ones(1024 ** 2).rechunk((1024,)))

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=10_000)

result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000)
assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=10_000)

# Also check the 2D case and the case with weights.

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000)
assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000)

result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000)
assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000)
2 changes: 1 addition & 1 deletion glue/viewers/histogram/layer_artist.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def _update_histogram(self, force=False, **kwargs):
# of updated properties is up to date after this method has been called.
changed = self.pop_changed_properties()

if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative')):
if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative', 'random_subset')):
self._calculate_histogram(reset=force)

if force or any(prop in changed for prop in ('alpha', 'color', 'zorder', 'visible')):
Expand Down
8 changes: 7 additions & 1 deletion glue/viewers/histogram/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ class HistogramViewerState(MatplotlibDataViewerState):

update_bins_on_reset_limits = DDCProperty(True, docstring="Whether to update the bins to match the view when resetting limits")

random_subset = DDCProperty(None, docstring='The maximum number of elements to use '
'when computing the histogram. If the data '
'is larger than this, a random subset of '
'the data will be used.')

def __init__(self, **kwargs):

super(HistogramViewerState, self).__init__()
Expand Down Expand Up @@ -260,7 +265,8 @@ def update_histogram(self):
range=[range],
bins=[self._viewer_state.hist_n_bin],
log=[self._viewer_state.x_log],
subset_state=subset_state)
subset_state=subset_state,
random_subset=self._viewer_state.random_subset)

# TODO: determine whether this belongs here or in the layer artist
if isinstance(range[0], np.datetime64):
Expand Down

0 comments on commit 4e44393

Please sign in to comment.