diff --git a/biggus/__init__.py b/biggus/__init__.py index 71456b6..79c5690 100644 --- a/biggus/__init__.py +++ b/biggus/__init__.py @@ -1335,6 +1335,11 @@ def process_data(self, data): class _MeanStreamsHandler(_AggregationStreamsHandler): + def __init__(self, array, axis, mdtol): + # The mdtol argument is not applicable to non-masked arrays + # so it is ignored. + super(_MeanStreamsHandler, self).__init__(array, axis) + def bootstrap(self, shape): self.running_total = np.zeros(shape, dtype=self.array.dtype) @@ -1351,8 +1356,13 @@ def process_data(self, data): class _MeanMaskedStreamsHandler(_AggregationStreamsHandler): + def __init__(self, array, axis, mdtol): + self._mdtol = mdtol + super(_MeanMaskedStreamsHandler, self).__init__(array, axis) + def bootstrap(self, shape): self.running_count = np.zeros(shape, dtype=self.array.dtype) + self.running_masked_count = np.zeros(shape, dtype=self.array.dtype) self.running_total = np.zeros(shape, dtype=self.array.dtype) def finalise(self): @@ -1363,11 +1373,19 @@ def finalise(self): # Promote array-scalar to 0-dimensional array. if array.ndim == 0: array = np.ma.array(array) + # Apply masked/missing data threshold (mdtol). + if self._mdtol < 1: + mask_update = np.true_divide(self.running_masked_count, + self.running_masked_count + + self.running_count) > self._mdtol + array.mask |= mask_update + chunk = Chunk(self.current_keys, array) return chunk def process_data(self, data): self.running_count += np.ma.count(data, axis=self.axis) + self.running_masked_count += np.ma.count_masked(data, axis=self.axis) self.running_total += np.sum(data, axis=self.axis) @@ -1569,7 +1587,7 @@ def _normalise_axis(axis, array): return axes -def mean(a, axis=None): +def mean(a, axis=None, mdtol=1): """ Request the mean of an Array over any number of axes. @@ -1583,6 +1601,14 @@ def mean(a, axis=None): If axis is a tuple of ints, the operation is performed over multiple axes. :type axis: None, or int, or iterable of ints. + :param float mdtol: Tolerance of missing data. The value in each + element of the resulting array will be masked if the + fraction of masked data contributing to that element + exceeds mdtol. mdtol=0 means no missing data is + tolerated while mdtol=1 will mean the resulting + element will be masked if and only if all the + contributing elements of the source array are masked. + Defaults to 1. :return: The Array representing the requested mean. :rtype: Array @@ -1590,9 +1616,10 @@ def mean(a, axis=None): axes = _normalise_axis(axis, a) assert axes is not None and len(axes) == 1 dtype = (np.array([0], dtype=a.dtype) / 1.).dtype + kwargs = dict(mdtol=mdtol) return _Aggregation(a, axes[0], _MeanStreamsHandler, _MeanMaskedStreamsHandler, - dtype, {}) + dtype, kwargs) def std(a, axis=None, ddof=0): diff --git a/biggus/tests/test_aggregation.py b/biggus/tests/test_aggregation.py index 2f9d083..56a05b1 100644 --- a/biggus/tests/test_aggregation.py +++ b/biggus/tests/test_aggregation.py @@ -1,4 +1,4 @@ -# (C) British Crown Copyright 2013, Met Office +# (C) British Crown Copyright 2013 - 2014, Met Office # # This file is part of Biggus. # @@ -17,6 +17,8 @@ import unittest import numpy as np +import numpy.ma +import numpy.testing import biggus from biggus.tests import AccessCounter @@ -95,6 +97,81 @@ def test_add_non_supported_type(self): biggus.mean(range(10), axis=0) +class TestMdtol(unittest.TestCase): + def setUp(self): + self.data = np.ma.arange(12).reshape(3, 4) + self.data[2, 1:] = np.ma.masked + + def _test_mean_with_mdtol(self, data, axis, numpy_result, mdtol=None): + data = AccessCounter(data) + array = biggus.NumpyArrayAdapter(data) + + # Perform aggregation. + if mdtol is None: + # Allow testing of default when mdtol is None. + biggus_aggregation = biggus.mean(array, axis=axis) + else: + biggus_aggregation = biggus.mean(array, axis=axis, mdtol=mdtol) + + # Check the aggregation operation doesn't actually read any data. + self.assertTrue((data.counts == 0).all()) + + # Check results. + biggus_result = biggus_aggregation.masked_array() + # Check resolving `op_array` to a NumPy array only reads + # each relevant source value once. + self.assertTrue((data.counts <= 1).all()) + numpy_mask = np.ma.getmaskarray(numpy_result) + biggus_mask = np.ma.getmaskarray(biggus_result) + np.testing.assert_array_equal(biggus_mask, numpy_mask) + np.testing.assert_array_equal(biggus_result[~biggus_mask].data, + numpy_result[~numpy_mask].data) + + def test_mean_mdtol_default(self): + axis = 0 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected) + + def test_mean_mdtol_one(self): + axis = 0 + mdtol = 1 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_zero(self): + axis = 0 + mdtol = 0 + expected = np.ma.mean(self.data, axis) + expected.mask = [False, True, True, True] + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_below_axis_zero(self): + axis = 0 + mdtol = 0.32 + expected = np.ma.mean(self.data, axis) + expected.mask = [False, True, True, True] + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_above_axis_zero(self): + axis = 0 + mdtol = 0.34 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_below_axis_one(self): + axis = 1 + mdtol = 0.74 + expected = np.ma.mean(self.data, axis) + expected.mask = [False, False, True] + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_above_axis_one(self): + axis = 1 + mdtol = 0.76 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + class TestFlow(unittest.TestCase): def _test_flow(self, axis): data = np.arange(3 * 4 * 5, dtype='f4').reshape(3, 4, 5)