Skip to content

Commit

Permalink
Group By - fix failing/wrong statistics on datetime data
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Sep 30, 2022
1 parent 2979b21 commit 2400565
Show file tree
Hide file tree
Showing 3 changed files with 276 additions and 10 deletions.
59 changes: 52 additions & 7 deletions Orange/widgets/data/owgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from orangewidget.utils.signals import Input, Output
from orangewidget.utils import enum_as_int
from orangewidget.widget import Msg
from pandas.core.dtypes.common import is_datetime64_any_dtype

from Orange.data import (
ContinuousVariable,
Expand Down Expand Up @@ -56,26 +57,59 @@ def concatenate(x):
return " ".join(str(v) for v in x if not pd.isnull(v) and len(str(v)) > 0)


def std(s):
    """
    Standard deviation that always yields a plain number.

    For datetime columns pandas reports the deviation as a Timedelta; that
    result is converted to seconds. When the deviation cannot be computed
    (pandas gives NaN or NaT) nan is returned so the resulting column
    stays numeric.
    """
    deviation = s.std()
    # datetime input: express the deviation in seconds
    if isinstance(deviation, pd.Timedelta):
        return deviation.total_seconds()
    # missing result (NaN/NaT): normalise to nan
    if pd.isna(deviation):
        return nan
    return deviation


def var(s):
    """
    Variance that also handles time variables.

    Datetime series are first converted to seconds since the UNIX epoch
    (1970-01-01) and the variance is computed on those numbers, so the
    result for a datetime column is expressed in seconds squared.

    Timezone-aware values are converted to UTC before the epoch is
    subtracted. The conversion goes through the ``.dt`` accessor because
    ``Series.tz_convert`` acts on the index, not on the values, and raises
    TypeError when the index is not a datetime index.
    """
    if is_datetime64_any_dtype(s):
        tz_aware = s.dt.tz is not None
        # the epoch must be tz-aware exactly when the data is, otherwise
        # the subtraction below fails
        initial_ts = pd.Timestamp("1970-01-01", tz="UTC" if tz_aware else None)
        if tz_aware:
            # convert the values (not the index) to UTC
            s = s.dt.tz_convert("UTC")
        s = (s - initial_ts) / pd.Timedelta("1s")
    var_ = s.var()
    return var_.total_seconds() if isinstance(var_, pd.Timedelta) else var_


def span(s):
    """
    Span (max - min) that also handles time variables.

    Subtracting two timestamps gives a Timedelta, which is converted to
    seconds. When the span cannot be computed (e.g. a datetime series with
    only missing values gives NaT - NaT = NaT) nan is returned, so the
    resulting column stays numeric - the same convention ``std`` follows.
    """
    span_ = pd.Series.max(s) - pd.Series.min(s)
    if isinstance(span_, pd.Timedelta):
        return span_.total_seconds()
    # NaT (or NaN) means no value could be computed - normalise to nan
    return nan if pd.isna(span_) else span_


AGGREGATIONS = {
"Mean": Aggregation("mean", {ContinuousVariable, TimeVariable}),
"Median": Aggregation("median", {ContinuousVariable, TimeVariable}),
"Mode": Aggregation(
lambda x: pd.Series.mode(x).get(0, nan),
{ContinuousVariable, DiscreteVariable, TimeVariable}
),
"Standard deviation": Aggregation("std", {ContinuousVariable, TimeVariable}),
"Variance": Aggregation("var", {ContinuousVariable, TimeVariable}),
"Sum": Aggregation("sum", {ContinuousVariable, TimeVariable}),
"Standard deviation": Aggregation(std, {ContinuousVariable, TimeVariable}),
"Variance": Aggregation(var, {ContinuousVariable, TimeVariable}),
"Sum": Aggregation("sum", {ContinuousVariable}),
"Concatenate": Aggregation(
concatenate,
{ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable},
),
"Min. value": Aggregation("min", {ContinuousVariable, TimeVariable}),
"Max. value": Aggregation("max", {ContinuousVariable, TimeVariable}),
"Span": Aggregation(
lambda x: pd.Series.max(x) - pd.Series.min(x),
{ContinuousVariable, TimeVariable},
),
"Span": Aggregation(span, {ContinuousVariable, TimeVariable}),
"First value": Aggregation(
"first", {ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable}
),
Expand Down Expand Up @@ -506,6 +540,17 @@ def __aggregation_compatible(agg, attr):
"""Check a compatibility of aggregation with the variable"""
return type(attr) in AGGREGATIONS[agg].types

@classmethod
def migrate_context(cls, context, _):
    """
    Remove "Sum" from the aggregations stored for time variables.

    Earlier the widget allowed Sum on time variables; it no longer does.
    Entries of the stored aggregations mapping whose key has two items,
    the second being 104 (the code used for TimeVariable), lose "Sum".
    The mapping is modified in place; the second argument is ignored.
    """
    time_variable_code = 104
    stored = context.values["aggregations"][0]
    for key, selected in stored.items():
        if len(key) == 2 and key[1] == time_variable_code:
            selected.discard("Sum")


if __name__ == "__main__":
# pylint: disable=ungrouped-imports
Expand Down
1 change: 1 addition & 0 deletions Orange/widgets/data/tests/test_oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,7 @@ def test_raise_pandas_version(self):
When this test start to fail:
- remove this test
- remove if clause in datetime_to_epoch function and supporting comments
- remove same if clause in var function in owgroupby (line 77, 78)
- set pandas dependency version to pandas>=1.4
"""
from datetime import datetime
Expand Down
226 changes: 223 additions & 3 deletions Orange/widgets/data/tests/test_owgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
table_to_frame,
Domain,
ContinuousVariable,
DiscreteVariable,
TimeVariable,
StringVariable,
)
from Orange.data.tests.test_aggregate import create_sample_data
from Orange.widgets.data.owgroupby import OWGroupBy
Expand Down Expand Up @@ -665,6 +668,28 @@ def test_context(self):
self.widget.aggregations,
)

def test_context_time_variable(self):
    """
    migrate_context must drop "Sum" for a TimeVariable.

    "Sum" and "Median" are added by hand to the aggregations of a time
    variable, the widget settings are packed and a second widget is built
    from them. After the same data is sent to that widget only "Mean" and
    "Median" are expected for the time variable.
    """
    time_var = TimeVariable("T", have_time=True, have_date=True)
    group_var = DiscreteVariable("G", values=["G1", "G2"])
    domain = Domain([group_var, time_var])
    values = np.array([[0.0, 0.0], [0, 10], [0, 20], [1, 500], [1, 1000]])
    data = Table.from_numpy(domain, values)

    self.send_signal(self.widget.Inputs.data, data)
    for aggregation in ("Sum", "Median"):
        self.widget.aggregations[time_var].add(aggregation)
    # NOTE(review): presumably sending other data makes the widget store
    # the context of the previous data set - confirm
    self.send_signal(self.widget.Inputs.data, self.iris)

    settings = self.widget.settingsHandler.pack_data(self.widget)
    restored = self.create_widget(OWGroupBy, stored_settings=settings)
    self.send_signal(restored.Inputs.data, data, widget=restored)
    self.assertSetEqual(restored.aggregations[time_var], {"Mean", "Median"})

@patch(
"Orange.data.aggregate.OrangeTableGroupBy.aggregate",
Mock(side_effect=ValueError("Test unexpected err")),
Expand All @@ -690,16 +715,211 @@ def test_time_variable(self):

# time variable as a group by variable
self.send_signal(self.widget.Inputs.data, data)
self._set_selection(self.widget.gb_attrs_view, [1])
self._set_selection(self.widget.gb_attrs_view, [3])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(3, len(output))

# time variable as a grouped variable
self.send_signal(self.widget.Inputs.data, data)
self._set_selection(self.widget.gb_attrs_view, [5])
attributes = [data.domain["c2"], data.domain["d2"]]
self.send_signal(self.widget.Inputs.data, data[:, attributes])
self._set_selection(self.widget.gb_attrs_view, [1]) # d2
# check all aggregations
self.assert_aggregations_equal(["Mean", "Mode"])
self.select_table_rows(self.widget.agg_table_view, [0]) # c2
for cb in self.widget.agg_checkboxes.values():
if cb.text() != "Mean":
cb.click()
self.assert_aggregations_equal(["Mean, Median, Mode and 12 more", "Mode"])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(2, len(output))

def test_time_variable_results(self):
    """
    Check the values and the output domain of aggregations on a time
    variable.

    The input has a discrete variable G (three values) and a time
    variable T. "Mode" is toggled for G, every checkbox except "Mean" is
    toggled for T, and the output is compared with a hand-written
    expected frame (without the "Random value" column) and with the
    expected attributes and metas.
    """
    data = Table.from_numpy(
        Domain(
            [
                DiscreteVariable("G", values=["G1", "G2", "G3"]),
                TimeVariable("T", have_time=True, have_date=True),
            ]
        ),
        np.array([[0.0, 0], [0, 10], [0, 20], [1, 500], [1, 1000], [2, 1]]),
    )
    self.send_signal(self.widget.Inputs.data, data)

    # disable aggregating G
    self.select_table_rows(self.widget.agg_table_view, [0])  # G
    self.widget.agg_checkboxes["Mode"].click()
    # select all possible aggregations for T
    self.select_table_rows(self.widget.agg_table_view, [1])  # T
    for cb in self.widget.agg_checkboxes.values():
        # "Mean" is left untouched; the assertion below expects it selected
        if cb.text() != "Mean":
            cb.click()
    self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])

    # one row per group G1, G2, G3; G3 has a single value, hence nan for
    # standard deviation and variance
    expected_df = pd.DataFrame(
        {
            "T - Mean": [
                "1970-01-01 00:00:10",
                "1970-01-01 00:12:30",
                "1970-01-01 00:00:01",
            ],
            "T - Median": [
                "1970-01-01 00:00:10",
                "1970-01-01 00:12:30",
                "1970-01-01 00:00:01",
            ],
            "T - Mode": [
                "1970-01-01 00:00:00",
                "1970-01-01 00:08:20",
                "1970-01-01 00:00:01",
            ],
            "T - Standard deviation": [10, 353.5533905932738, np.nan],
            "T - Variance": [100, 125000, np.nan],
            "T - Min. value": [
                "1970-01-01 00:00:00",
                "1970-01-01 00:08:20",
                "1970-01-01 00:00:01",
            ],
            "T - Max. value": [
                "1970-01-01 00:00:20",
                "1970-01-01 00:16:40",
                "1970-01-01 00:00:01",
            ],
            "T - Span": [20, 500, 0],
            "T - First value": [
                "1970-01-01 00:00:00",
                "1970-01-01 00:08:20",
                "1970-01-01 00:00:01",
            ],
            "T - Last value": [
                "1970-01-01 00:00:20",
                "1970-01-01 00:16:40",
                "1970-01-01 00:00:01",
            ],
            "T - Count defined": [3, 2, 1],
            "T - Count": [3, 2, 1],
            "T - Proportion defined": [1, 1, 1],
            "T - Concatenate": [
                "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
                "1970-01-01 00:08:20 1970-01-01 00:16:40",
                "1970-01-01 00:00:01",
            ],
            "G": ["G1", "G2", "G3"],
        }
    )
    # columns written above as strings that are compared as datetimes
    df_col = [
        "T - Mean",
        "T - Median",
        "T - Mode",
        "T - Min. value",
        "T - Max. value",
        "T - First value",
        "T - Last value",
    ]
    expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
    output = self.get_output(self.widget.Outputs.data)
    output_df = table_to_frame(output, include_metas=True)
    # remove random since it is not possible to test
    output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]

    pd.testing.assert_frame_equal(
        output_df,
        expected_df,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
        atol=1e-3,
    )
    # the domain check, unlike the frame check, includes "Random value"
    expected_attributes = (
        TimeVariable("T - Mean", have_date=1, have_time=1),
        TimeVariable("T - Median", have_date=1, have_time=1),
        TimeVariable("T - Mode", have_date=1, have_time=1),
        ContinuousVariable(name="T - Standard deviation"),
        ContinuousVariable(name="T - Variance"),
        TimeVariable("T - Min. value", have_date=1, have_time=1),
        TimeVariable("T - Max. value", have_date=1, have_time=1),
        ContinuousVariable(name="T - Span"),
        TimeVariable("T - First value", have_date=1, have_time=1),
        TimeVariable("T - Last value", have_date=1, have_time=1),
        TimeVariable("T - Random value", have_date=1, have_time=1),
        ContinuousVariable(name="T - Count defined"),
        ContinuousVariable(name="T - Count"),
        ContinuousVariable(name="T - Proportion defined"),
    )
    expected_metas = (
        StringVariable(name="T - Concatenate"),
        DiscreteVariable(name="G", values=("G1", "G2", "G3")),
    )
    self.assertTupleEqual(output.domain.attributes, expected_attributes)
    self.assertTupleEqual(output.domain.metas, expected_metas)

def test_tz_time_variable_results(self):
    """
    Test aggregation results for a time variable parsed from
    timezone-aware strings.

    The three input values are parsed from strings with a +01:00 offset;
    the expected frame lists the same instants one hour earlier, without
    an offset. All rows belong to group G1, so one output row is expected.
    """
    tv = TimeVariable("T", have_time=True, have_date=True)
    data = Table.from_numpy(
        Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
        np.array([[0.0, tv.parse("1970-01-01 01:00:00+01:00")],
                  [0, tv.parse("1970-01-01 01:00:10+01:00")],
                  [0, tv.parse("1970-01-01 01:00:20+01:00")]]),
    )

    self.send_signal(self.widget.Inputs.data, data)

    # disable aggregating G
    self.select_table_rows(self.widget.agg_table_view, [0])  # G
    self.widget.agg_checkboxes["Mode"].click()
    # select all possible aggregations for T
    self.select_table_rows(self.widget.agg_table_view, [1])  # T
    for cb in self.widget.agg_checkboxes.values():
        # "Mean" is left untouched; the assertion below expects it selected
        if cb.text() != "Mean":
            cb.click()
    self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])

    expected_df = pd.DataFrame(
        {
            "T - Mean": ["1970-01-01 00:00:10"],
            "T - Median": ["1970-01-01 00:00:10"],
            "T - Mode": ["1970-01-01 00:00:00"],
            "T - Standard deviation": [10],
            "T - Variance": [100],
            "T - Min. value": ["1970-01-01 00:00:00"],
            "T - Max. value": ["1970-01-01 00:00:20"],
            "T - Span": [20, ],
            "T - First value": ["1970-01-01 00:00:00"],
            "T - Last value": ["1970-01-01 00:00:20"],
            "T - Count defined": [3],
            "T - Count": [3],
            "T - Proportion defined": [1],
            "T - Concatenate": [
                "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
            ],
            "G": ["G1"],
        }
    )
    # columns written above as strings that are compared as datetimes
    df_col = [
        "T - Mean",
        "T - Median",
        "T - Mode",
        "T - Min. value",
        "T - Max. value",
        "T - First value",
        "T - Last value",
    ]
    expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
    output_df = table_to_frame(
        self.get_output(self.widget.Outputs.data), include_metas=True
    )
    # remove random since it is not possible to test
    output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]

    pd.testing.assert_frame_equal(
        output_df,
        expected_df,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
        atol=1e-3,
    )

def test_only_nan_in_group(self):
data = Table(
Domain([ContinuousVariable("A"), ContinuousVariable("B")]),
Expand Down

0 comments on commit 2400565

Please sign in to comment.