From f73fb5f461c4d4899d31cf75f30770ad7bcadf7e Mon Sep 17 00:00:00 2001
From: Ed Campbell <drescampbell@gmail.com>
Date: Wed, 11 Jun 2014 17:05:18 +0100
Subject: [PATCH 1/3] Added mdtol support for biggus.mean()

---
 biggus/__init__.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/biggus/__init__.py b/biggus/__init__.py
index 71456b6..70d06b2 100644
--- a/biggus/__init__.py
+++ b/biggus/__init__.py
@@ -1335,6 +1335,11 @@ def process_data(self, data):
 
 
 class _MeanStreamsHandler(_AggregationStreamsHandler):
+    def __init__(self, array, axis, mdtol=1):
+        # The mdtol kwarg not applicable to non-masked arrays
+        # so it is ignored.
+        super(_MeanStreamsHandler, self).__init__(array, axis)
+
     def bootstrap(self, shape):
         self.running_total = np.zeros(shape, dtype=self.array.dtype)
 
@@ -1351,8 +1356,13 @@ def process_data(self, data):
 
 
 class _MeanMaskedStreamsHandler(_AggregationStreamsHandler):
+    def __init__(self, array, axis, mdtol=1):
+        self._mdtol = mdtol
+        super(_MeanMaskedStreamsHandler, self).__init__(array, axis)
+
     def bootstrap(self, shape):
         self.running_count = np.zeros(shape, dtype=self.array.dtype)
+        self.running_masked_count = np.zeros(shape, dtype=self.array.dtype)
         self.running_total = np.zeros(shape, dtype=self.array.dtype)
 
     def finalise(self):
@@ -1363,11 +1373,19 @@ def finalise(self):
         # Promote array-scalar to 0-dimensional array.
         if array.ndim == 0:
             array = np.ma.array(array)
+        # Apply masked/missing data threshold (mdtol).
+        if self._mdtol < 1:
+            mask_update = np.true_divide(self.running_masked_count,
+                                         self.running_masked_count +
+                                         self.running_count) > self._mdtol
+            array.mask |= mask_update
+
         chunk = Chunk(self.current_keys, array)
         return chunk
 
     def process_data(self, data):
         self.running_count += np.ma.count(data, axis=self.axis)
+        self.running_masked_count += np.ma.count_masked(data, axis=self.axis)
         self.running_total += np.sum(data, axis=self.axis)
 
 
@@ -1569,7 +1587,7 @@ def _normalise_axis(axis, array):
     return axes
 
 
-def mean(a, axis=None):
+def mean(a, axis=None, mdtol=1):
     """
     Request the mean of an Array over any number of axes.
 
@@ -1583,16 +1601,26 @@ def mean(a, axis=None):
                  If axis is a tuple of ints, the operation is performed
                  over multiple axes.
     :type axis: None, or int, or iterable of ints.
+    :param float mdtol: Tolerance of missing data. The value in each
+                        element of the resulting array will be masked if the
+                        fraction of masked data contributing to that element
+                        exceeds mdtol. mdtol=0 means no missing data is
+                        tolerated while mdtol=1 will mean the resulting
+                        element will be masked if and only if all the
+                        contributing elements of the source array are masked.
+                        Defaults to 1.
     :return: The Array representing the requested mean.
     :rtype: Array
 
     """
+    import pdb;pdb.set_trace()
     axes = _normalise_axis(axis, a)
     assert axes is not None and len(axes) == 1
     dtype = (np.array([0], dtype=a.dtype) / 1.).dtype
+    kwargs = dict(mdtol=mdtol)
     return _Aggregation(a, axes[0],
                         _MeanStreamsHandler, _MeanMaskedStreamsHandler,
-                        dtype, {})
+                        dtype, kwargs)
 
 
 def std(a, axis=None, ddof=0):

From 00439ad05548228a6c31978fbb4afc68c380ae0e Mon Sep 17 00:00:00 2001
From: Ed Campbell <drescampbell@gmail.com>
Date: Thu, 12 Jun 2014 14:43:07 +0100
Subject: [PATCH 2/3] Removed default val for mdtol from Handlers

---
 biggus/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/biggus/__init__.py b/biggus/__init__.py
index 70d06b2..9cf8410 100644
--- a/biggus/__init__.py
+++ b/biggus/__init__.py
@@ -1335,8 +1335,8 @@ def process_data(self, data):
 
 
 class _MeanStreamsHandler(_AggregationStreamsHandler):
-    def __init__(self, array, axis, mdtol=1):
-        # The mdtol kwarg not applicable to non-masked arrays
+    def __init__(self, array, axis, mdtol):
+        # The mdtol argument is not applicable to non-masked arrays
         # so it is ignored.
         super(_MeanStreamsHandler, self).__init__(array, axis)
 
@@ -1356,7 +1356,7 @@ def process_data(self, data):
 
 
 class _MeanMaskedStreamsHandler(_AggregationStreamsHandler):
-    def __init__(self, array, axis, mdtol=1):
+    def __init__(self, array, axis, mdtol):
         self._mdtol = mdtol
         super(_MeanMaskedStreamsHandler, self).__init__(array, axis)
 

From b1666903449473f7010b29774d6e241979b2121f Mon Sep 17 00:00:00 2001
From: Ed Campbell <drescampbell@gmail.com>
Date: Thu, 12 Jun 2014 16:00:14 +0100
Subject: [PATCH 3/3] tests for mdtol

---
 biggus/__init__.py               |  1 -
 biggus/tests/test_aggregation.py | 79 +++++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/biggus/__init__.py b/biggus/__init__.py
index 9cf8410..79c5690 100644
--- a/biggus/__init__.py
+++ b/biggus/__init__.py
@@ -1613,7 +1613,6 @@ def mean(a, axis=None, mdtol=1):
     :rtype: Array
 
     """
-    import pdb;pdb.set_trace()
     axes = _normalise_axis(axis, a)
     assert axes is not None and len(axes) == 1
     dtype = (np.array([0], dtype=a.dtype) / 1.).dtype
diff --git a/biggus/tests/test_aggregation.py b/biggus/tests/test_aggregation.py
index 2f9d083..56a05b1 100644
--- a/biggus/tests/test_aggregation.py
+++ b/biggus/tests/test_aggregation.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2013, Met Office
+# (C) British Crown Copyright 2013 - 2014, Met Office
 #
 # This file is part of Biggus.
 #
@@ -17,6 +17,8 @@
 import unittest
 
 import numpy as np
+import numpy.ma
+import numpy.testing
 
 import biggus
 from biggus.tests import AccessCounter
@@ -95,6 +97,81 @@ def test_add_non_supported_type(self):
             biggus.mean(range(10), axis=0)
 
 
+class TestMdtol(unittest.TestCase):
+    def setUp(self):
+        self.data = np.ma.arange(12).reshape(3, 4)
+        self.data[2, 1:] = np.ma.masked
+
+    def _test_mean_with_mdtol(self, data, axis, numpy_result, mdtol=None):
+        data = AccessCounter(data)
+        array = biggus.NumpyArrayAdapter(data)
+
+        # Perform aggregation.
+        if mdtol is None:
+            # Allow testing of default when mdtol is None.
+            biggus_aggregation = biggus.mean(array, axis=axis)
+        else:
+            biggus_aggregation = biggus.mean(array, axis=axis, mdtol=mdtol)
+
+        # Check the aggregation operation doesn't actually read any data.
+        self.assertTrue((data.counts == 0).all())
+
+        # Check results.
+        biggus_result = biggus_aggregation.masked_array()
+        # Check resolving `op_array` to a NumPy array only reads
+        # each relevant source value once.
+        self.assertTrue((data.counts <= 1).all())
+        numpy_mask = np.ma.getmaskarray(numpy_result)
+        biggus_mask = np.ma.getmaskarray(biggus_result)
+        np.testing.assert_array_equal(biggus_mask, numpy_mask)
+        np.testing.assert_array_equal(biggus_result[~biggus_mask].data,
+                                      numpy_result[~numpy_mask].data)
+
+    def test_mean_mdtol_default(self):
+        axis = 0
+        expected = np.ma.mean(self.data, axis)
+        self._test_mean_with_mdtol(self.data, axis, expected)
+
+    def test_mean_mdtol_one(self):
+        axis = 0
+        mdtol = 1
+        expected = np.ma.mean(self.data, axis)
+        self._test_mean_with_mdtol(self.data, axis, expected, mdtol)
+
+    def test_mean_mdtol_zero(self):
+        axis = 0
+        mdtol = 0
+        expected = np.ma.mean(self.data, axis)
+        expected.mask = [False, True, True, True]
+        self._test_mean_with_mdtol(self.data, axis, expected, mdtol)
+
+    def test_mean_mdtol_fraction_below_axis_zero(self):
+        axis = 0
+        mdtol = 0.32
+        expected = np.ma.mean(self.data, axis)
+        expected.mask = [False, True, True, True]
+        self._test_mean_with_mdtol(self.data, axis, expected, mdtol)
+
+    def test_mean_mdtol_fraction_above_axis_zero(self):
+        axis = 0
+        mdtol = 0.34
+        expected = np.ma.mean(self.data, axis)
+        self._test_mean_with_mdtol(self.data, axis, expected, mdtol)
+
+    def test_mean_mdtol_fraction_below_axis_one(self):
+        axis = 1
+        mdtol = 0.74
+        expected = np.ma.mean(self.data, axis)
+        expected.mask = [False, False, True]
+        self._test_mean_with_mdtol(self.data, axis, expected, mdtol)
+
+    def test_mean_mdtol_fraction_above_axis_one(self):
+        axis = 1
+        mdtol = 0.76
+        expected = np.ma.mean(self.data, axis)
+        self._test_mean_with_mdtol(self.data, axis, expected, mdtol)
+
+
 class TestFlow(unittest.TestCase):
     def _test_flow(self, axis):
         data = np.arange(3 * 4 * 5, dtype='f4').reshape(3, 4, 5)