Add the ability to specify a random subset size for computing histogr…

…ams, and expose as a viewer state property in the histogram viewer
glue-viz · Mar 19, 2024 · 4e44393 · 4e44393
1 parent 1bbd6e8
commit 4e44393
Show file tree

Hide file tree

Showing 4 changed files with 105 additions and 6 deletions.
diff --git a/glue/core/data.py b/glue/core/data.py
@@ -518,7 +518,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
         raise NotImplementedError()
 
     @abc.abstractmethod
-    def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
+    def compute_histogram(self, cids, weights=None, range=None, bins=None,
+                          log=None, subset_state=None, random_subset=None):
         """
         Compute an n-dimensional histogram with regularly spaced bins.
 
@@ -537,6 +538,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
         subset_state : :class:`~glue.core.subset.SubsetState`, optional
             If specified, the histogram will only take into account values in
             the subset state.
+        random_subset : `int`, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic.
         """
         raise NotImplementedError()
 
@@ -1877,7 +1881,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
             full_result[result_slices] = result
             return full_result
 
-    def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None, subset_state=None):
+    def compute_histogram(self, cids, weights=None, range=None, bins=None,
+                          log=None, subset_state=None, random_subset=None):
         """
         Compute an n-dimensional histogram with regularly spaced bins.
 
@@ -1898,6 +1903,9 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
         subset_state : :class:`~glue.core.subset.SubsetState`, optional
             If specified, the histogram will only take into account values in
             the subset state.
+        random_subset : `int`, optional
+            If specified, this should be an integer giving the number of values
+            to use for the statistic.
         """
 
         if len(cids) > 2:
@@ -1947,6 +1955,38 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
                 else:
                     w = w[mask]
 
+        # TODO: avoid duplication of the following code block with compute_statistic
+
+        if random_subset and x.size > random_subset:
+
+            original_size = x.size
+
+            if DASK_INSTALLED and isinstance(x, da.Array):
+                # We shouldn't cache _random_subset_indices_dask here because
+                # it might be different for different dask arrays
+                random_subset_indices_dask = (x.size, random_views_for_dask_array(x, random_subset, n_chunks=10))
+                x = da.hstack([x[slices].ravel() for slices in random_subset_indices_dask[1]])
+                if ndim > 1:
+                    y = da.hstack([y[slices].ravel() for slices in random_subset_indices_dask[1]])
+                if w is not None:
+                    w = da.hstack([w[slices].ravel() for slices in random_subset_indices_dask[1]])
+            else:
+                if not hasattr(self, '_random_subset_histogram_indices') or self._random_subset_histogram_indices[0] != x.size:
+                    self._random_subset_histogram_indices = (x.size, np.random.randint(0, x.size, random_subset))
+                x = x.ravel(order="K")[self._random_subset_histogram_indices[1]]
+                if ndim > 1:
+                    y = y.ravel(order="K")[self._random_subset_histogram_indices[1]]
+                if w is not None:
+                    w = w.ravel(order="K")[self._random_subset_histogram_indices[1]]
+
+            # Determine correction factor by which to scale the histogram so
+            # that it still has the right order of magnitude
+            correction = original_size / x.size
+
+        else:
+
+            correction = 1.
+
         if ndim == 1:
             xmin, xmax = range[0]
             xmin, xmax = sorted((xmin, xmax))
@@ -2020,10 +2060,10 @@ def compute_histogram(self, cids, weights=None, range=None, bins=None, log=None,
 
         if ndim == 1:
             range = (xmin, xmax)
-            return histogram1d(x, range=range, bins=bins[0], weights=w)
+            return histogram1d(x, range=range, bins=bins[0], weights=w) * correction
         elif ndim > 1:
             range = [(xmin, xmax), (ymin, ymax)]
-            return histogram2d(x, y, range=range, bins=bins, weights=w)
+            return histogram2d(x, y, range=range, bins=bins, weights=w) * correction
 
     def compute_fixed_resolution_buffer(self, *args, **kwargs):
         from .fixed_resolution_buffer import compute_fixed_resolution_buffer

diff --git a/glue/core/tests/test_data.py b/glue/core/tests/test_data.py
@@ -1132,3 +1132,56 @@ def test_compute_histogram_dask_mixed():
 
     result = data.compute_histogram([data.id['y'], data.id['x']], range=[[-0.5, 11.25], [-0.5, 11.25]], bins=[2, 3], subset_state=data.id['x'] > 4.5)
     assert_allclose(result, [[0, 1, 0], [0, 2, 2]])
+
+
+def test_compute_histogram_random_subset():
+
+    # Make sure that compute_histogram works with the random_subset option
+
+    with NumpyRNGContext(12345):
+
+        data = Data(x=np.random.uniform(0, 10, 1024**2), y=np.ones(1024 ** 2))
+
+        result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
+        assert_allclose(result, [178437., 230277., 230631., 230821., 178410.])
+
+        result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000)
+        assert_allclose(result, [171756.7488, 239180.1856, 231420.7232, 228799.2832, 177419.0592])
+
+        result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=10_000)
+        assert_allclose(result, [0., 0., 168148.277, 228024.0614, 180665.6616])
+
+        # Also check the 2D case and the case with weights. Note that the values
+        # now change compared to above because the random indices got reset when
+        # computing the subset. In a way this is a check that this is happening
+        # correctly.
+
+        result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=10_000)
+        assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])
+
+        result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=10_000)
+        assert_allclose(result[:, 0], [175741.3376, 230477.0048, 234881.024, 231525.5808, 175951.0528])
+
+
+def test_compute_histogram_random_subset_dask():
+
+    # Make sure that compute_histogram works with the random_subset option (dask mode)
+
+    da = pytest.importorskip('dask.array')
+
+    data = Data(x=da.random.uniform(0, 10, 1024**2).rechunk((1024,)),
+                y=da.ones(1024 ** 2).rechunk((1024,)))
+
+    result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], random_subset=10_000_000)
+    assert_allclose(result, [178535., 230694., 229997., 231215., 178135.], atol=10_000)
+
+    result = data.compute_histogram([data.id['x']], range=[[-0.5, 10.5]], bins=[5], subset_state=data.id['x'] > 4.5, random_subset=100_000)
+    assert_allclose(result, [0., 0., 168079., 230673., 178745.], atol=10_000)
+
+    # Also check the 2D case and the case with weights.
+
+    result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], random_subset=100_000)
+    assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000)
+
+    result = data.compute_histogram([data.id['x'], data.id['y']], range=[[-0.5, 10.5], [-0.5, 1.5]], bins=[5, 1], weights=data.id['y'], random_subset=100_000)
+    assert_allclose(result[:, 0], [178535., 230694., 229997., 231215., 178135.], atol=10_000)
diff --git a/glue/viewers/histogram/layer_artist.py b/glue/viewers/histogram/layer_artist.py
@@ -167,7 +167,7 @@ def _update_histogram(self, force=False, **kwargs):
         # of updated properties is up to date after this method has been called.
         changed = self.pop_changed_properties()
 
-        if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative')):
+        if force or any(prop in changed for prop in ('layer', 'x_att', 'hist_x_min', 'hist_x_max', 'hist_n_bin', 'x_log', 'y_log', 'normalize', 'cumulative', 'random_subset')):
             self._calculate_histogram(reset=force)
 
         if force or any(prop in changed for prop in ('alpha', 'color', 'zorder', 'visible')):

diff --git a/glue/viewers/histogram/state.py b/glue/viewers/histogram/state.py
@@ -42,6 +42,11 @@ class HistogramViewerState(MatplotlibDataViewerState):
 
     update_bins_on_reset_limits = DDCProperty(True, docstring="Whether to update the bins to match the view when resetting limits")
 
+    random_subset = DDCProperty(None, docstring='The maximum number of elements to use '
+                                                'when computing the histogram. If the data '
+                                                'is larger than this, a random subset of '
+                                                'the data will be used.')
+
     def __init__(self, **kwargs):
 
         super(HistogramViewerState, self).__init__()
@@ -260,7 +265,8 @@ def update_histogram(self):
                                              range=[range],
                                              bins=[self._viewer_state.hist_n_bin],
                                              log=[self._viewer_state.x_log],
-                                             subset_state=subset_state)
+                                             subset_state=subset_state,
+                                             random_subset=self._viewer_state.random_subset)
 
         # TODO: determine whether this belongs here or in the layer artist
         if isinstance(range[0], np.datetime64):