From f73fb5f461c4d4899d31cf75f30770ad7bcadf7e Mon Sep 17 00:00:00 2001 From: Ed Campbell Date: Wed, 11 Jun 2014 17:05:18 +0100 Subject: [PATCH 1/3] Added mdtol support for biggus.mean() --- biggus/__init__.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/biggus/__init__.py b/biggus/__init__.py index 71456b6..70d06b2 100644 --- a/biggus/__init__.py +++ b/biggus/__init__.py @@ -1335,6 +1335,11 @@ def process_data(self, data): class _MeanStreamsHandler(_AggregationStreamsHandler): + def __init__(self, array, axis, mdtol=1): + # The mdtol kwarg not applicable to non-masked arrays + # so it is ignored. + super(_MeanStreamsHandler, self).__init__(array, axis) + def bootstrap(self, shape): self.running_total = np.zeros(shape, dtype=self.array.dtype) @@ -1351,8 +1356,13 @@ def process_data(self, data): class _MeanMaskedStreamsHandler(_AggregationStreamsHandler): + def __init__(self, array, axis, mdtol=1): + self._mdtol = mdtol + super(_MeanMaskedStreamsHandler, self).__init__(array, axis) + def bootstrap(self, shape): self.running_count = np.zeros(shape, dtype=self.array.dtype) + self.running_masked_count = np.zeros(shape, dtype=self.array.dtype) self.running_total = np.zeros(shape, dtype=self.array.dtype) def finalise(self): @@ -1363,11 +1373,19 @@ def finalise(self): # Promote array-scalar to 0-dimensional array. if array.ndim == 0: array = np.ma.array(array) + # Apply masked/missing data threshold (mdtol). + if self._mdtol < 1: + mask_update = np.true_divide(self.running_masked_count, + self.running_masked_count + + self.running_count) > self._mdtol + array.mask |= mask_update + chunk = Chunk(self.current_keys, array) return chunk def process_data(self, data): self.running_count += np.ma.count(data, axis=self.axis) + self.running_masked_count += np.ma.count_masked(data, axis=self.axis) self.running_total += np.sum(data, axis=self.axis) @@ -1569,7 +1587,7 @@ def _normalise_axis(axis, array): return axes -def mean(a, axis=None): +def mean(a, axis=None, mdtol=1): """ Request the mean of an Array over any number of axes. @@ -1583,16 +1601,26 @@ def mean(a, axis=None): If axis is a tuple of ints, the operation is performed over multiple axes. :type axis: None, or int, or iterable of ints. + :param float mdtol: Tolerance of missing data. The value in each + element of the resulting array will be masked if the + fraction of masked data contributing to that element + exceeds mdtol. mdtol=0 means no missing data is + tolerated while mdtol=1 will mean the resulting + element will be masked if and only if all the + contributing elements of the source array are masked. + Defaults to 1. :return: The Array representing the requested mean. :rtype: Array """ + import pdb;pdb.set_trace() axes = _normalise_axis(axis, a) assert axes is not None and len(axes) == 1 dtype = (np.array([0], dtype=a.dtype) / 1.).dtype + kwargs = dict(mdtol=mdtol) return _Aggregation(a, axes[0], _MeanStreamsHandler, _MeanMaskedStreamsHandler, - dtype, {}) + dtype, kwargs) def std(a, axis=None, ddof=0): From 00439ad05548228a6c31978fbb4afc68c380ae0e Mon Sep 17 00:00:00 2001 From: Ed Campbell Date: Thu, 12 Jun 2014 14:43:07 +0100 Subject: [PATCH 2/3] Removed default val for mdtol from Handlers --- biggus/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/biggus/__init__.py b/biggus/__init__.py index 70d06b2..9cf8410 100644 --- a/biggus/__init__.py +++ b/biggus/__init__.py @@ -1335,8 +1335,8 @@ def process_data(self, data): class _MeanStreamsHandler(_AggregationStreamsHandler): - def __init__(self, array, axis, mdtol=1): - # The mdtol kwarg not applicable to non-masked arrays + def __init__(self, array, axis, mdtol): + # The mdtol argument is not applicable to non-masked arrays # so it is ignored. super(_MeanStreamsHandler, self).__init__(array, axis) @@ -1356,7 +1356,7 @@ def process_data(self, data): class _MeanMaskedStreamsHandler(_AggregationStreamsHandler): - def __init__(self, array, axis, mdtol=1): + def __init__(self, array, axis, mdtol): self._mdtol = mdtol super(_MeanMaskedStreamsHandler, self).__init__(array, axis) From b1666903449473f7010b29774d6e241979b2121f Mon Sep 17 00:00:00 2001 From: Ed Campbell Date: Thu, 12 Jun 2014 16:00:14 +0100 Subject: [PATCH 3/3] tests for mdtol --- biggus/__init__.py | 1 - biggus/tests/test_aggregation.py | 79 +++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/biggus/__init__.py b/biggus/__init__.py index 9cf8410..79c5690 100644 --- a/biggus/__init__.py +++ b/biggus/__init__.py @@ -1613,7 +1613,6 @@ def mean(a, axis=None, mdtol=1): :rtype: Array """ - import pdb;pdb.set_trace() axes = _normalise_axis(axis, a) assert axes is not None and len(axes) == 1 dtype = (np.array([0], dtype=a.dtype) / 1.).dtype diff --git a/biggus/tests/test_aggregation.py b/biggus/tests/test_aggregation.py index 2f9d083..56a05b1 100644 --- a/biggus/tests/test_aggregation.py +++ b/biggus/tests/test_aggregation.py @@ -1,4 +1,4 @@ -# (C) British Crown Copyright 2013, Met Office +# (C) British Crown Copyright 2013 - 2014, Met Office # # This file is part of Biggus. # @@ -17,6 +17,8 @@ import unittest import numpy as np +import numpy.ma +import numpy.testing import biggus from biggus.tests import AccessCounter @@ -95,6 +97,81 @@ def test_add_non_supported_type(self): biggus.mean(range(10), axis=0) +class TestMdtol(unittest.TestCase): + def setUp(self): + self.data = np.ma.arange(12).reshape(3, 4) + self.data[2, 1:] = np.ma.masked + + def _test_mean_with_mdtol(self, data, axis, numpy_result, mdtol=None): + data = AccessCounter(data) + array = biggus.NumpyArrayAdapter(data) + + # Perform aggregation. + if mdtol is None: + # Allow testing of default when mdtol is None. + biggus_aggregation = biggus.mean(array, axis=axis) + else: + biggus_aggregation = biggus.mean(array, axis=axis, mdtol=mdtol) + + # Check the aggregation operation doesn't actually read any data. + self.assertTrue((data.counts == 0).all()) + + # Check results. + biggus_result = biggus_aggregation.masked_array() + # Check resolving `op_array` to a NumPy array only reads + # each relevant source value once. + self.assertTrue((data.counts <= 1).all()) + numpy_mask = np.ma.getmaskarray(numpy_result) + biggus_mask = np.ma.getmaskarray(biggus_result) + np.testing.assert_array_equal(biggus_mask, numpy_mask) + np.testing.assert_array_equal(biggus_result[~biggus_mask].data, + numpy_result[~numpy_mask].data) + + def test_mean_mdtol_default(self): + axis = 0 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected) + + def test_mean_mdtol_one(self): + axis = 0 + mdtol = 1 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_zero(self): + axis = 0 + mdtol = 0 + expected = np.ma.mean(self.data, axis) + expected.mask = [False, True, True, True] + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_below_axis_zero(self): + axis = 0 + mdtol = 0.32 + expected = np.ma.mean(self.data, axis) + expected.mask = [False, True, True, True] + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_above_axis_zero(self): + axis = 0 + mdtol = 0.34 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_below_axis_one(self): + axis = 1 + mdtol = 0.74 + expected = np.ma.mean(self.data, axis) + expected.mask = [False, False, True] + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + def test_mean_mdtol_fraction_above_axis_one(self): + axis = 1 + mdtol = 0.76 + expected = np.ma.mean(self.data, axis) + self._test_mean_with_mdtol(self.data, axis, expected, mdtol) + + class TestFlow(unittest.TestCase): def _test_flow(self, axis): data = np.arange(3 * 4 * 5, dtype='f4').reshape(3, 4, 5)