Group By - fix failing/wrong statistics on datetime data

biolab · Sep 30, 2022 · 2400565 · 2400565
1 parent 2979b21
commit 2400565
Show file tree

Hide file tree

Showing 3 changed files with 276 additions and 10 deletions.
diff --git a/Orange/widgets/data/owgroupby.py b/Orange/widgets/data/owgroupby.py
@@ -26,6 +26,7 @@
 from orangewidget.utils.signals import Input, Output
 from orangewidget.utils import enum_as_int
 from orangewidget.widget import Msg
+from pandas.core.dtypes.common import is_datetime64_any_dtype
 
 from Orange.data import (
     ContinuousVariable,
@@ -56,26 +57,59 @@ def concatenate(x):
     return " ".join(str(v) for v in x if not pd.isnull(v) and len(str(v)) > 0)
 
 
+def std(s):
+    """
+    Standard deviation that also supports time variables. For datetime columns
+    pandas' std returns a Timedelta object, which is converted to seconds here.
+    """
+    result = s.std()
+    if pd.isna(result):
+        # NaT/NaN means the value cannot be computed - return nan so that the
+        # resulting column stays numeric
+        return nan
+    return result.total_seconds() if isinstance(result, pd.Timedelta) else result
+
+
+def var(s):
+    """
+    Variance that also handles time variables. Pandas's variance function
+    doesn't support DatetimeArray - this function first converts a datetime
+    series to seconds since the UNIX epoch and then computes the variance.
+
+    For non-datetime series the result of ``s.var()`` is returned; a Timedelta
+    result is converted to seconds.
+    """
+    if is_datetime64_any_dtype(s):
+        # a tz-aware series needs a tz-aware reference timestamp, otherwise
+        # the subtraction below fails
+        initial_ts = pd.Timestamp("1970-01-01", tz=None if s.dt.tz is None else "UTC")
+        if s.dt.tz is not None:
+            # Convert the values through the .dt accessor: Series.tz_convert
+            # would convert the index (and fail on a non-datetime index).
+            # Clause kept for pandas < 1.4, see test_raise_pandas_version.
+            s = s.dt.tz_convert("UTC")
+        s = (s - initial_ts) / pd.Timedelta("1s")
+    var_ = s.var()
+    return var_.total_seconds() if isinstance(var_, pd.Timedelta) else var_
+
+
+def span(s):
+    """
+    Span (max - min) that also handles time variables. Subtracting datetimes
+    returns a Timedelta object, which is converted to seconds here.
+    """
+    span_ = pd.Series.max(s) - pd.Series.min(s)
+    if isinstance(span_, pd.Timedelta):
+        return span_.total_seconds()
+    # NaT - NaT is NaT when a group has no defined datetime value - change it
+    # to nan to keep the column numeric (the same is done in std)
+    return nan if pd.isna(span_) else span_
+
+
 AGGREGATIONS = {
     "Mean": Aggregation("mean", {ContinuousVariable, TimeVariable}),
     "Median": Aggregation("median", {ContinuousVariable, TimeVariable}),
     "Mode": Aggregation(
         lambda x: pd.Series.mode(x).get(0, nan),
         {ContinuousVariable, DiscreteVariable, TimeVariable}
     ),
-    "Standard deviation": Aggregation("std", {ContinuousVariable, TimeVariable}),
-    "Variance": Aggregation("var", {ContinuousVariable, TimeVariable}),
-    "Sum": Aggregation("sum", {ContinuousVariable, TimeVariable}),
+    "Standard deviation": Aggregation(std, {ContinuousVariable, TimeVariable}),
+    "Variance": Aggregation(var, {ContinuousVariable, TimeVariable}),
+    "Sum": Aggregation("sum", {ContinuousVariable}),
     "Concatenate": Aggregation(
         concatenate,
         {ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable},
     ),
     "Min. value": Aggregation("min", {ContinuousVariable, TimeVariable}),
     "Max. value": Aggregation("max", {ContinuousVariable, TimeVariable}),
-    "Span": Aggregation(
-        lambda x: pd.Series.max(x) - pd.Series.min(x),
-        {ContinuousVariable, TimeVariable},
-    ),
+    "Span": Aggregation(span, {ContinuousVariable, TimeVariable}),
     "First value": Aggregation(
         "first", {ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable}
     ),
@@ -506,6 +540,17 @@ def __aggregation_compatible(agg, attr):
         """Check a compatibility of aggregation with the variable"""
         return type(attr) in AGGREGATIONS[agg].types
 
+    @classmethod
+    def migrate_context(cls, context, _):
+        """
+        Drop Sum from the stored aggregations of time variables. The widget
+        used to allow Sum on TimeVariable; now it is forbidden.
+        """
+        time_variable_code = 104  # code that marks TimeVariable in context keys
+        for key, selected in context.values["aggregations"][0].items():
+            if len(key) == 2 and key[1] == time_variable_code:
+                selected.discard("Sum")
+
 
 if __name__ == "__main__":
     # pylint: disable=ungrouped-imports

diff --git a/Orange/widgets/data/tests/test_oweditdomain.py b/Orange/widgets/data/tests/test_oweditdomain.py
@@ -962,6 +962,7 @@ def test_raise_pandas_version(self):
         When this test start to fail:
         - remove this test
         - remove if clause in datetime_to_epoch function and supporting comments
+        - remove the same if clause in the var function in owgroupby (lines 77, 78)
         - set pandas dependency version to pandas>=1.4
         """
         from datetime import datetime

diff --git a/Orange/widgets/data/tests/test_owgroupby.py b/Orange/widgets/data/tests/test_owgroupby.py
@@ -14,6 +14,9 @@
     table_to_frame,
     Domain,
     ContinuousVariable,
+    DiscreteVariable,
+    TimeVariable,
+    StringVariable,
 )
 from Orange.data.tests.test_aggregate import create_sample_data
 from Orange.widgets.data.owgroupby import OWGroupBy
@@ -665,6 +668,28 @@ def test_context(self):
             self.widget.aggregations,
         )
 
+    def test_context_time_variable(self):
+        """
+        Test migrate_context, which removes Sum from the stored aggregations
+        of a TimeVariable since GroupBy does not support it anymore for
+        TimeVariable
+        """
+        tv = TimeVariable("T", have_time=True, have_date=True)
+        data = Table.from_numpy(
+            Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
+            np.array([[0.0, 0.0], [0, 10], [0, 20], [1, 500], [1, 1000]]),
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        # emulate settings of an older widget version that still allowed Sum
+        self.widget.aggregations[tv].add("Sum")
+        self.widget.aggregations[tv].add("Median")
+        # switch to other data - presumably this stores the context for
+        # `data`; TODO confirm against the settings handler
+        self.send_signal(self.widget.Inputs.data, self.iris)
+
+        # a new widget is created from the packed settings of the first one
+        widget = self.create_widget(
+            OWGroupBy,
+            stored_settings=self.widget.settingsHandler.pack_data(self.widget),
+        )
+        self.send_signal(widget.Inputs.data, data, widget=widget)
+        # Sum must be gone; Median was added above, Mean was not added by
+        # this test, so it has to be selected by the widget itself
+        self.assertSetEqual(widget.aggregations[tv], {"Mean", "Median"})
+
     @patch(
         "Orange.data.aggregate.OrangeTableGroupBy.aggregate",
         Mock(side_effect=ValueError("Test unexpected err")),
@@ -690,16 +715,211 @@ def test_time_variable(self):
 
         # time variable as a group by variable
         self.send_signal(self.widget.Inputs.data, data)
-        self._set_selection(self.widget.gb_attrs_view, [1])
+        self._set_selection(self.widget.gb_attrs_view, [3])
         output = self.get_output(self.widget.Outputs.data)
         self.assertEqual(3, len(output))
 
         # time variable as a grouped variable
-        self.send_signal(self.widget.Inputs.data, data)
-        self._set_selection(self.widget.gb_attrs_view, [5])
+        attributes = [data.domain["c2"], data.domain["d2"]]
+        self.send_signal(self.widget.Inputs.data, data[:, attributes])
+        self._set_selection(self.widget.gb_attrs_view, [1])  # d2
+        # check all aggregations
+        self.assert_aggregations_equal(["Mean", "Mode"])
+        self.select_table_rows(self.widget.agg_table_view, [0])  # c2
+        for cb in self.widget.agg_checkboxes.values():
+            if cb.text() != "Mean":
+                cb.click()
+        self.assert_aggregations_equal(["Mean, Median, Mode and 12 more", "Mode"])
         output = self.get_output(self.widget.Outputs.data)
         self.assertEqual(2, len(output))
 
+    def test_time_variable_results(self):
+        """
+        Test values and variable types of all aggregations available for a
+        time variable
+        """
+        # groups: G1 -> 0, 10, 20; G2 -> 500, 1000; G3 -> 1 (values of T)
+        data = Table.from_numpy(
+            Domain(
+                [
+                    DiscreteVariable("G", values=["G1", "G2", "G3"]),
+                    TimeVariable("T", have_time=True, have_date=True),
+                ]
+            ),
+            np.array([[0.0, 0], [0, 10], [0, 20], [1, 500], [1, 1000], [2, 1]]),
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+
+        # disable aggregating G
+        self.select_table_rows(self.widget.agg_table_view, [0])  # G
+        self.widget.agg_checkboxes["Mode"].click()
+        # select all possible aggregations for T
+        self.select_table_rows(self.widget.agg_table_view, [1])  # T
+        for cb in self.widget.agg_checkboxes.values():
+            # Mean is skipped - a click would presumably uncheck it since the
+            # assertion below expects it to stay selected
+            if cb.text() != "Mean":
+                cb.click()
+        self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])
+
+        # datetime-like aggregations are written as strings and converted to
+        # datetime below; Standard deviation, Variance and Span are numbers
+        expected_df = pd.DataFrame(
+            {
+                "T - Mean": [
+                    "1970-01-01 00:00:10",
+                    "1970-01-01 00:12:30",
+                    "1970-01-01 00:00:01",
+                ],
+                "T - Median": [
+                    "1970-01-01 00:00:10",
+                    "1970-01-01 00:12:30",
+                    "1970-01-01 00:00:01",
+                ],
+                "T - Mode": [
+                    "1970-01-01 00:00:00",
+                    "1970-01-01 00:08:20",
+                    "1970-01-01 00:00:01",
+                ],
+                # G3 has a single value - std and variance are not defined
+                "T - Standard deviation": [10, 353.5533905932738, np.nan],
+                "T - Variance": [100, 125000, np.nan],
+                "T - Min. value": [
+                    "1970-01-01 00:00:00",
+                    "1970-01-01 00:08:20",
+                    "1970-01-01 00:00:01",
+                ],
+                "T - Max. value": [
+                    "1970-01-01 00:00:20",
+                    "1970-01-01 00:16:40",
+                    "1970-01-01 00:00:01",
+                ],
+                "T - Span": [20, 500, 0],
+                "T - First value": [
+                    "1970-01-01 00:00:00",
+                    "1970-01-01 00:08:20",
+                    "1970-01-01 00:00:01",
+                ],
+                "T - Last value": [
+                    "1970-01-01 00:00:20",
+                    "1970-01-01 00:16:40",
+                    "1970-01-01 00:00:01",
+                ],
+                "T - Count defined": [3, 2, 1],
+                "T - Count": [3, 2, 1],
+                "T - Proportion defined": [1, 1, 1],
+                "T - Concatenate": [
+                    "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
+                    "1970-01-01 00:08:20 1970-01-01 00:16:40",
+                    "1970-01-01 00:00:01",
+                ],
+                "G": ["G1", "G2", "G3"],
+            }
+        )
+        # columns that are expected to hold datetime values
+        df_col = [
+            "T - Mean",
+            "T - Median",
+            "T - Mode",
+            "T - Min. value",
+            "T - Max. value",
+            "T - First value",
+            "T - Last value",
+        ]
+        expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
+        output = self.get_output(self.widget.Outputs.data)
+        output_df = table_to_frame(output, include_metas=True)
+        # remove random since it is not possible to test
+        output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]
+
+        pd.testing.assert_frame_equal(
+            output_df,
+            expected_df,
+            check_dtype=False,
+            check_column_type=False,
+            check_categorical=False,
+            atol=1e-3,
+        )
+        # check also types of output variables: Standard deviation, Variance,
+        # Span and counts are continuous, other aggregations stay time variables
+        expected_attributes = (
+            TimeVariable("T - Mean", have_date=1, have_time=1),
+            TimeVariable("T - Median", have_date=1, have_time=1),
+            TimeVariable("T - Mode", have_date=1, have_time=1),
+            ContinuousVariable(name="T - Standard deviation"),
+            ContinuousVariable(name="T - Variance"),
+            TimeVariable("T - Min. value", have_date=1, have_time=1),
+            TimeVariable("T - Max. value", have_date=1, have_time=1),
+            ContinuousVariable(name="T - Span"),
+            TimeVariable("T - First value", have_date=1, have_time=1),
+            TimeVariable("T - Last value", have_date=1, have_time=1),
+            TimeVariable("T - Random value", have_date=1, have_time=1),
+            ContinuousVariable(name="T - Count defined"),
+            ContinuousVariable(name="T - Count"),
+            ContinuousVariable(name="T - Proportion defined"),
+        )
+        expected_metas = (
+            StringVariable(name="T - Concatenate"),
+            DiscreteVariable(name="G", values=("G1", "G2", "G3")),
+        )
+        self.assertTupleEqual(output.domain.attributes, expected_attributes)
+        self.assertTupleEqual(output.domain.metas, expected_metas)
+
+    def test_tz_time_variable_results(self):
+        """
+        Test results for a time variable whose values are parsed from strings
+        with a timezone offset (+01:00); expected values below are written
+        without the offset, one hour earlier
+        """
+        # NOTE(review): values are produced by tv.parse before the table is
+        # built - presumably the aggregations then see timezone-naive
+        # datetimes, so the tz branch of var may not be exercised; confirm
+        tv = TimeVariable("T", have_time=True, have_date=True)
+        data = Table.from_numpy(
+            Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
+            np.array([[0.0, tv.parse("1970-01-01 01:00:00+01:00")],
+                      [0, tv.parse("1970-01-01 01:00:10+01:00")],
+                     [0, tv.parse("1970-01-01 01:00:20+01:00")]]),
+        )
+
+        self.send_signal(self.widget.Inputs.data, data)
+
+        # disable aggregating G
+        self.select_table_rows(self.widget.agg_table_view, [0])  # G
+        self.widget.agg_checkboxes["Mode"].click()
+        # select all possible aggregations for T
+        self.select_table_rows(self.widget.agg_table_view, [1])  # T
+        for cb in self.widget.agg_checkboxes.values():
+            # Mean is skipped - a click would presumably uncheck it since the
+            # assertion below expects it to stay selected
+            if cb.text() != "Mean":
+                cb.click()
+        self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])
+
+        # all rows belong to G1, so a single output row is expected
+        expected_df = pd.DataFrame(
+            {
+                "T - Mean": ["1970-01-01 00:00:10"],
+                "T - Median": ["1970-01-01 00:00:10"],
+                "T - Mode": ["1970-01-01 00:00:00"],
+                "T - Standard deviation": [10],
+                "T - Variance": [100],
+                "T - Min. value": ["1970-01-01 00:00:00"],
+                "T - Max. value": ["1970-01-01 00:00:20"],
+                "T - Span": [20, ],
+                "T - First value": ["1970-01-01 00:00:00"],
+                "T - Last value": ["1970-01-01 00:00:20"],
+                "T - Count defined": [3],
+                "T - Count": [3],
+                "T - Proportion defined": [1],
+                "T - Concatenate": [
+                    "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
+                ],
+                "G": ["G1"],
+            }
+        )
+        # columns that are expected to hold datetime values
+        df_col = [
+            "T - Mean",
+            "T - Median",
+            "T - Mode",
+            "T - Min. value",
+            "T - Max. value",
+            "T - First value",
+            "T - Last value",
+        ]
+        expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
+        output_df = table_to_frame(
+            self.get_output(self.widget.Outputs.data), include_metas=True
+        )
+        # remove random since it is not possible to test
+        output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]
+
+        pd.testing.assert_frame_equal(
+            output_df,
+            expected_df,
+            check_dtype=False,
+            check_column_type=False,
+            check_categorical=False,
+            atol=1e-3,
+        )
+
     def test_only_nan_in_group(self):
         data = Table(
             Domain([ContinuousVariable("A"), ContinuousVariable("B")]),