From 2400565b5a8e6bb957967aae370c54c1a56f2223 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 9 Sep 2022 13:34:26 +0200
Subject: [PATCH] Group By - fix failing/wrong statistics on datetime data
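
Pandas aggregations on datetime columns do not return plain numbers:
std yields a Timedelta, variance is not supported on DatetimeArray at
all, and max - min (span) again yields a Timedelta. As a consequence
the widget failed or output wrong values for these statistics on
TimeVariable columns. This patch wraps std, var and span in helpers
that convert the result (or, for var, the input) to seconds, drops Sum
for TimeVariable, and migrates stored contexts that still select Sum
for a time variable.

A minimal standalone sketch of the pandas behaviour the helpers work
around (values mirror the new tests; this is not widget code):

    import pandas as pd

    s = pd.to_datetime(pd.Series(["1970-01-01 00:00:00",
                                  "1970-01-01 00:00:10",
                                  "1970-01-01 00:00:20"]))
    print(s.std())                              # Timedelta('0 days 00:00:10')
    print(s.std().total_seconds())              # 10.0 - what the widget now reports
    print((s.max() - s.min()).total_seconds())  # 20.0 - span in seconds
    # variance: convert to UNIX epoch seconds first, then compute
    epoch = (s - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
    print(epoch.var())                          # 100.0
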
---
Orange/widgets/data/owgroupby.py | 59 ++++-
.../widgets/data/tests/test_oweditdomain.py | 1 +
Orange/widgets/data/tests/test_owgroupby.py | 226 +++++++++++++++++-
3 files changed, 276 insertions(+), 10 deletions(-)
diff --git a/Orange/widgets/data/owgroupby.py b/Orange/widgets/data/owgroupby.py
index c5f4916b9f1..6e654df5c12 100644
--- a/Orange/widgets/data/owgroupby.py
+++ b/Orange/widgets/data/owgroupby.py
@@ -26,6 +26,7 @@
from orangewidget.utils.signals import Input, Output
from orangewidget.utils import enum_as_int
from orangewidget.widget import Msg
+from pandas.core.dtypes.common import is_datetime64_any_dtype
from Orange.data import (
ContinuousVariable,
@@ -56,6 +57,42 @@ def concatenate(x):
return " ".join(str(v) for v in x if not pd.isnull(v) and len(str(v)) > 0)
+def std(s):
+ """
+    Std that also handles time variables. Pandas's std returns a Timedelta
+    object for datetime columns - transform the Timedelta to seconds.
+ """
+ std_ = s.std()
+ if isinstance(std_, pd.Timedelta):
+ return std_.total_seconds()
+    # std returns NaT when it cannot compute a value - change it to nan to keep the column numeric
+ return nan if pd.isna(std_) else std_
+
+
+def var(s):
+ """
+    Variance that also handles time variables. Pandas's variance function
+    doesn't support DatetimeArray (at least before pandas 1.4) - this function
+    first converts the datetime series to UNIX epoch and then computes the variance.
+ """
+ if is_datetime64_any_dtype(s):
+ initial_ts = pd.Timestamp("1970-01-01", tz=None if s.dt.tz is None else "UTC")
+        if s.dt.tz is not None:
+            # Series.tz_convert works on the index - use .dt to convert the values
+            s = s.dt.tz_convert("UTC")
+ s = (s - initial_ts) / pd.Timedelta("1s")
+ var_ = s.var()
+ return var_.total_seconds() if isinstance(var_, pd.Timedelta) else var_
+
+
+def span(s):
+ """
+    Span that also handles time variables. Subtracting datetimes returns a
+    Timedelta object for datetime columns - transform the Timedelta to seconds.
+ """
+ span_ = pd.Series.max(s) - pd.Series.min(s)
+ return span_.total_seconds() if isinstance(span_, pd.Timedelta) else span_
+
+
AGGREGATIONS = {
"Mean": Aggregation("mean", {ContinuousVariable, TimeVariable}),
"Median": Aggregation("median", {ContinuousVariable, TimeVariable}),
@@ -63,19 +100,16 @@ def concatenate(x):
lambda x: pd.Series.mode(x).get(0, nan),
{ContinuousVariable, DiscreteVariable, TimeVariable}
),
- "Standard deviation": Aggregation("std", {ContinuousVariable, TimeVariable}),
- "Variance": Aggregation("var", {ContinuousVariable, TimeVariable}),
- "Sum": Aggregation("sum", {ContinuousVariable, TimeVariable}),
+ "Standard deviation": Aggregation(std, {ContinuousVariable, TimeVariable}),
+ "Variance": Aggregation(var, {ContinuousVariable, TimeVariable}),
+ "Sum": Aggregation("sum", {ContinuousVariable}),
"Concatenate": Aggregation(
concatenate,
{ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable},
),
"Min. value": Aggregation("min", {ContinuousVariable, TimeVariable}),
"Max. value": Aggregation("max", {ContinuousVariable, TimeVariable}),
- "Span": Aggregation(
- lambda x: pd.Series.max(x) - pd.Series.min(x),
- {ContinuousVariable, TimeVariable},
- ),
+ "Span": Aggregation(span, {ContinuousVariable, TimeVariable}),
"First value": Aggregation(
"first", {ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable}
),
@@ -506,6 +540,17 @@ def __aggregation_compatible(agg, attr):
"""Check a compatibility of aggregation with the variable"""
return type(attr) in AGGREGATIONS[agg].types
+ @classmethod
+ def migrate_context(cls, context, _):
+ """
+        The widget used to allow Sum on TimeVariable; now it is forbidden.
+        This function removes Sum from the context for TimeVariables
+        (encoded in context values as type 104).
+ """
+ for var_, v in context.values["aggregations"][0].items():
+ if len(var_) == 2:
+ if var_[1] == 104:
+ v.discard("Sum")
+
if __name__ == "__main__":
# pylint: disable=ungrouped-imports
diff --git a/Orange/widgets/data/tests/test_oweditdomain.py b/Orange/widgets/data/tests/test_oweditdomain.py
index 01b9d5fa445..0881fed3a7d 100644
--- a/Orange/widgets/data/tests/test_oweditdomain.py
+++ b/Orange/widgets/data/tests/test_oweditdomain.py
@@ -962,6 +962,7 @@ def test_raise_pandas_version(self):
When this test start to fail:
- remove this test
- remove if clause in datetime_to_epoch function and supporting comments
+        - remove the same if clause in the var function in owgroupby (lines 77 and 78)
- set pandas dependency version to pandas>=1.4
"""
from datetime import datetime
diff --git a/Orange/widgets/data/tests/test_owgroupby.py b/Orange/widgets/data/tests/test_owgroupby.py
index 3ce1c13bb21..c6bde34bcaa 100644
--- a/Orange/widgets/data/tests/test_owgroupby.py
+++ b/Orange/widgets/data/tests/test_owgroupby.py
@@ -14,6 +14,9 @@
table_to_frame,
Domain,
ContinuousVariable,
+ DiscreteVariable,
+ TimeVariable,
+ StringVariable,
)
from Orange.data.tests.test_aggregate import create_sample_data
from Orange.widgets.data.owgroupby import OWGroupBy
@@ -665,6 +668,28 @@ def test_context(self):
self.widget.aggregations,
)
+ def test_context_time_variable(self):
+ """
+        Test migrate_context, which removes Sum for TimeVariable since
+        the Group By widget no longer supports it for TimeVariable.
+ """
+ tv = TimeVariable("T", have_time=True, have_date=True)
+ data = Table.from_numpy(
+ Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
+ np.array([[0.0, 0.0], [0, 10], [0, 20], [1, 500], [1, 1000]]),
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.widget.aggregations[tv].add("Sum")
+ self.widget.aggregations[tv].add("Median")
+ self.send_signal(self.widget.Inputs.data, self.iris)
+
+ widget = self.create_widget(
+ OWGroupBy,
+ stored_settings=self.widget.settingsHandler.pack_data(self.widget),
+ )
+ self.send_signal(widget.Inputs.data, data, widget=widget)
+ self.assertSetEqual(widget.aggregations[tv], {"Mean", "Median"})
+
@patch(
"Orange.data.aggregate.OrangeTableGroupBy.aggregate",
Mock(side_effect=ValueError("Test unexpected err")),
@@ -690,16 +715,211 @@ def test_time_variable(self):
# time variable as a group by variable
self.send_signal(self.widget.Inputs.data, data)
- self._set_selection(self.widget.gb_attrs_view, [1])
+ self._set_selection(self.widget.gb_attrs_view, [3])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(3, len(output))
# time variable as a grouped variable
- self.send_signal(self.widget.Inputs.data, data)
- self._set_selection(self.widget.gb_attrs_view, [5])
+ attributes = [data.domain["c2"], data.domain["d2"]]
+ self.send_signal(self.widget.Inputs.data, data[:, attributes])
+ self._set_selection(self.widget.gb_attrs_view, [1]) # d2
+ # check all aggregations
+ self.assert_aggregations_equal(["Mean", "Mode"])
+ self.select_table_rows(self.widget.agg_table_view, [0]) # c2
+ for cb in self.widget.agg_checkboxes.values():
+ if cb.text() != "Mean":
+ cb.click()
+ self.assert_aggregations_equal(["Mean, Median, Mode and 12 more", "Mode"])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(2, len(output))
+ def test_time_variable_results(self):
+ data = Table.from_numpy(
+ Domain(
+ [
+ DiscreteVariable("G", values=["G1", "G2", "G3"]),
+ TimeVariable("T", have_time=True, have_date=True),
+ ]
+ ),
+ np.array([[0.0, 0], [0, 10], [0, 20], [1, 500], [1, 1000], [2, 1]]),
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+
+ # disable aggregating G
+        self.select_table_rows(self.widget.agg_table_view, [0])  # G
+ self.widget.agg_checkboxes["Mode"].click()
+ # select all possible aggregations for T
+ self.select_table_rows(self.widget.agg_table_view, [1]) # T
+ for cb in self.widget.agg_checkboxes.values():
+ if cb.text() != "Mean":
+ cb.click()
+ self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])
+
+ expected_df = pd.DataFrame(
+ {
+ "T - Mean": [
+ "1970-01-01 00:00:10",
+ "1970-01-01 00:12:30",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Median": [
+ "1970-01-01 00:00:10",
+ "1970-01-01 00:12:30",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Mode": [
+ "1970-01-01 00:00:00",
+ "1970-01-01 00:08:20",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Standard deviation": [10, 353.5533905932738, np.nan],
+ "T - Variance": [100, 125000, np.nan],
+ "T - Min. value": [
+ "1970-01-01 00:00:00",
+ "1970-01-01 00:08:20",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Max. value": [
+ "1970-01-01 00:00:20",
+ "1970-01-01 00:16:40",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Span": [20, 500, 0],
+ "T - First value": [
+ "1970-01-01 00:00:00",
+ "1970-01-01 00:08:20",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Last value": [
+ "1970-01-01 00:00:20",
+ "1970-01-01 00:16:40",
+ "1970-01-01 00:00:01",
+ ],
+ "T - Count defined": [3, 2, 1],
+ "T - Count": [3, 2, 1],
+ "T - Proportion defined": [1, 1, 1],
+ "T - Concatenate": [
+ "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
+ "1970-01-01 00:08:20 1970-01-01 00:16:40",
+ "1970-01-01 00:00:01",
+ ],
+ "G": ["G1", "G2", "G3"],
+ }
+ )
+ df_col = [
+ "T - Mean",
+ "T - Median",
+ "T - Mode",
+ "T - Min. value",
+ "T - Max. value",
+ "T - First value",
+ "T - Last value",
+ ]
+ expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
+ output = self.get_output(self.widget.Outputs.data)
+ output_df = table_to_frame(output, include_metas=True)
+ # remove random since it is not possible to test
+ output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]
+
+ pd.testing.assert_frame_equal(
+ output_df,
+ expected_df,
+ check_dtype=False,
+ check_column_type=False,
+ check_categorical=False,
+ atol=1e-3,
+ )
+ expected_attributes = (
+ TimeVariable("T - Mean", have_date=1, have_time=1),
+ TimeVariable("T - Median", have_date=1, have_time=1),
+ TimeVariable("T - Mode", have_date=1, have_time=1),
+ ContinuousVariable(name="T - Standard deviation"),
+ ContinuousVariable(name="T - Variance"),
+ TimeVariable("T - Min. value", have_date=1, have_time=1),
+ TimeVariable("T - Max. value", have_date=1, have_time=1),
+ ContinuousVariable(name="T - Span"),
+ TimeVariable("T - First value", have_date=1, have_time=1),
+ TimeVariable("T - Last value", have_date=1, have_time=1),
+ TimeVariable("T - Random value", have_date=1, have_time=1),
+ ContinuousVariable(name="T - Count defined"),
+ ContinuousVariable(name="T - Count"),
+ ContinuousVariable(name="T - Proportion defined"),
+ )
+ expected_metas = (
+ StringVariable(name="T - Concatenate"),
+ DiscreteVariable(name="G", values=("G1", "G2", "G3")),
+ )
+ self.assertTupleEqual(output.domain.attributes, expected_attributes)
+ self.assertTupleEqual(output.domain.metas, expected_metas)
+
+ def test_tz_time_variable_results(self):
+        """Test results for a time variable with a timezone"""
+ tv = TimeVariable("T", have_time=True, have_date=True)
+ data = Table.from_numpy(
+ Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
+ np.array([[0.0, tv.parse("1970-01-01 01:00:00+01:00")],
+ [0, tv.parse("1970-01-01 01:00:10+01:00")],
+ [0, tv.parse("1970-01-01 01:00:20+01:00")]]),
+ )
+
+ self.send_signal(self.widget.Inputs.data, data)
+
+ # disable aggregating G
+        self.select_table_rows(self.widget.agg_table_view, [0])  # G
+ self.widget.agg_checkboxes["Mode"].click()
+ # select all possible aggregations for T
+ self.select_table_rows(self.widget.agg_table_view, [1]) # T
+ for cb in self.widget.agg_checkboxes.values():
+ if cb.text() != "Mean":
+ cb.click()
+ self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])
+
+ expected_df = pd.DataFrame(
+ {
+ "T - Mean": ["1970-01-01 00:00:10"],
+ "T - Median": ["1970-01-01 00:00:10"],
+ "T - Mode": ["1970-01-01 00:00:00"],
+ "T - Standard deviation": [10],
+ "T - Variance": [100],
+ "T - Min. value": ["1970-01-01 00:00:00"],
+ "T - Max. value": ["1970-01-01 00:00:20"],
+                "T - Span": [20],
+ "T - First value": ["1970-01-01 00:00:00"],
+ "T - Last value": ["1970-01-01 00:00:20"],
+ "T - Count defined": [3],
+ "T - Count": [3],
+ "T - Proportion defined": [1],
+ "T - Concatenate": [
+ "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
+ ],
+ "G": ["G1"],
+ }
+ )
+ df_col = [
+ "T - Mean",
+ "T - Median",
+ "T - Mode",
+ "T - Min. value",
+ "T - Max. value",
+ "T - First value",
+ "T - Last value",
+ ]
+ expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
+ output_df = table_to_frame(
+ self.get_output(self.widget.Outputs.data), include_metas=True
+ )
+ # remove random since it is not possible to test
+ output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]
+
+ pd.testing.assert_frame_equal(
+ output_df,
+ expected_df,
+ check_dtype=False,
+ check_column_type=False,
+ check_categorical=False,
+ atol=1e-3,
+ )
+
def test_only_nan_in_group(self):
data = Table(
Domain([ContinuousVariable("A"), ContinuousVariable("B")]),