-
-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5541 from PrimozGodec/aggregate
[ENH] New widget: Group By
- Loading branch information
Showing
12 changed files
with
1,713 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,3 +10,4 @@ | |
from .io import * | ||
from .filter import * | ||
from .pandas_compat import * | ||
from .aggregate import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from functools import lru_cache | ||
from typing import Callable, Dict, List, Tuple, Union | ||
|
||
import pandas as pd | ||
|
||
from Orange.data import Domain, Table, Variable, table_from_frame, table_to_frame | ||
from Orange.util import dummy_callback | ||
|
||
|
||
class OrangeTableGroupBy:
    """
    Result of a group-by operation on an Orange ``Table``.

    Wraps a pandas ``GroupBy`` object built from the table and offers
    aggregation on it, converting the result back to a table.

    Parameters
    ----------
    table
        Table to be grouped
    by
        Variables used for grouping. Resulting groups are defined with unique
        combinations of those values.

    Examples
    --------
    from Orange.data import Table

    table = Table("iris")
    gb = table.groupby([table.domain["iris"]])
    aggregated_table = gb.aggregate(
        {table.domain["sepal length"]: ["mean", "median"],
         table.domain["petal length"]: ["mean"]}
    )
    """

    def __init__(self, table: Table, by: List[Variable]):
        self.table = table

        df = table_to_frame(table, include_metas=True)
        # observed=True keeps only groups with at least one instance
        self.group_by = df.groupby([a.name for a in by], observed=True)

        # The cache is created per instance (not as a decorator on the method)
        # so cached results live and die together with this group-by object.
        self.compute_aggregation = lru_cache()(self._compute_aggregation)

    def aggregate(
        self,
        aggregations: Dict[
            Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
        ],
        callback: Callable = dummy_callback,
    ) -> Table:
        """
        Compute aggregations for each group

        Parameters
        ----------
        aggregations
            The dictionary that defines aggregations that need to be computed
            for variables. We support two formats:
            - {variable: [agg function 1, agg function 2]}
            - {variable: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
            Where agg name is the aggregation name used in the output column name.
            Aggregation function can be either function or string that defines
            aggregation in Pandas (e.g. mean).
        callback
            Callback function to report the progress

        Returns
        -------
        Table that includes aggregation columns. Variables that are used for
        grouping are in metas.
        """
        num_aggs = sum(len(aggs) for aggs in aggregations.values())
        count = 0

        result_agg = []
        for col, aggs in aggregations.items():
            for agg in aggs:
                result_agg.append(self._cached_aggregation(col, agg))
                count += 1
                # computing aggregations accounts for the first 80% of progress
                callback(count / num_aggs * 0.8)

        agg_table = self._aggregations_to_table(result_agg)
        callback(1)
        return agg_table

    def _cached_aggregation(
        self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
    ) -> pd.DataFrame:
        """
        Return the aggregation of `col` with `agg`, using the per-instance cache.

        Unhashable aggregation specifications cannot be used as cache keys, so
        they are computed directly. Hashability is probed up front so that a
        TypeError raised by the aggregation itself is never swallowed.
        """
        try:
            hash((col, agg))
        except TypeError:
            return self._compute_aggregation(col, agg)
        return self.compute_aggregation(col, agg)

    def _compute_aggregation(
        self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
    ) -> pd.DataFrame:
        """
        Aggregate a single column of the grouped data.

        The result has one column named "<variable name> - <aggregation name>",
        where the aggregation name is taken from the tuple, the string itself,
        or the callable's ``__name__``.
        """
        # use named aggregation to avoid issues with same column names when reset_index
        if isinstance(agg, tuple):
            name, agg = agg
        else:
            name = agg if isinstance(agg, str) else agg.__name__
        col_name = f"{col.name} - {name}"
        return self.group_by[col.name].agg(**{col_name: agg})

    def _aggregations_to_table(self, aggregations: List[pd.DataFrame]) -> Table:
        """Concatenate aggregation results and convert back to Table"""
        if aggregations:
            df = pd.concat(aggregations, axis=1)
        else:
            # when no aggregation is computed return a table with only the
            # group-by columns: take any per-group frame and drop its columns
            df = self.group_by.first()
            df = df.drop(columns=df.columns)
        gb_attributes = df.index.names
        df = df.reset_index()  # move group by var that are in index to columns
        table = table_from_frame(df)

        # group by variables should be the last columns in metas in the output
        metas = table.domain.metas
        new_metas = [m for m in metas if m.name not in gb_attributes] + [
            table.domain[n] for n in gb_attributes
        ]
        new_domain = Domain(
            [var for var in table.domain.attributes if var.name not in gb_attributes],
            metas=new_metas,
        )
        # keeps input table's type - e.g. output is Corpus if input Corpus
        return self.table.from_table(new_domain, table)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import unittest | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from Orange.data import ( | ||
DiscreteVariable, | ||
ContinuousVariable, | ||
Domain, | ||
StringVariable, | ||
Table, | ||
table_to_frame, | ||
) | ||
|
||
|
||
def create_sample_data():
    """
    Build the 13-row table used by the group-by tests.

    The domain has three continuous attributes ("a", "b", "cvar"), one
    discrete attribute ("dvar") and one string meta ("svar"). Row 3 contains
    missing values in "cvar" and "dvar" and an empty string in "svar".
    """
    attributes = [
        ContinuousVariable("a"),
        ContinuousVariable("b"),
        ContinuousVariable("cvar"),
        DiscreteVariable("dvar", values=["val1", "val2"]),
    ]
    domain = Domain(attributes, metas=[StringVariable("svar")])

    # columns: a, b, cvar, dvar (index into the discrete values)
    x = np.array(
        [
            [1, 1, 0.1, 0],
            [1, 1, 0.2, 1],
            [1, 2, np.nan, np.nan],
            [1, 2, 0.3, 1],
            [1, 3, 0.3, 0],
            [1, 3, 0.4, 1],
            [1, 3, 0.6, 0],
            [2, 1, 1.0, 1],
            [2, 1, 2.0, 0],
            [2, 2, 3.0, 1],
            [2, 2, -4.0, 0],
            [2, 3, 5.0, 1],
            [2, 3, 5.0, 0],
        ]
    )

    svar_values = [
        "sval1",
        "sval2",
        "",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
    ]
    # one meta column -> each value becomes its own single-element row
    metas = np.array([[value] for value in svar_values])

    return Table.from_numpy(domain, x, metas=metas)
|
||
|
||
# pylint: disable=abstract-method
class AlternativeTable(Table):
    """Trivial Table subclass used to check that the table type is preserved."""
|
||
|
||
class DomainTest(unittest.TestCase):
    """Tests for grouping a Table and aggregating the resulting groups."""

    def setUp(self) -> None:
        self.data = create_sample_data()

    def test_simple_aggregation(self):
        """Test aggregation results"""
        domain = self.data.domain
        grouped = self.data.groupby([domain["a"]])
        output = grouped.aggregate({domain["a"]: ["mean"], domain["b"]: ["mean"]})

        np.testing.assert_array_almost_equal(output.X, [[1, 2.143], [2, 2]], decimal=3)
        np.testing.assert_array_almost_equal(output.metas, [[1], [2]], decimal=3)

        attribute_names = [var.name for var in output.domain.attributes]
        meta_names = [var.name for var in output.domain.metas]
        self.assertListEqual(["a - mean", "b - mean"], attribute_names)
        self.assertListEqual(["a"], meta_names)

    def test_aggregation(self):
        """Test named aggregations over two group-by variables"""
        domain = self.data.domain
        grouped = self.data.groupby([self.data.domain["a"], self.data.domain["b"]])

        aggregations = {
            domain["cvar"]: [("Mean", "mean"), ("Median", "median"), ("Mean1", np.mean)],
            domain["dvar"]: [("Count defined", "count"), ("Count", "size")],
            domain["svar"]: [("Concatenate", "".join)],
        }
        output = grouped.aggregate(aggregations)

        expected_columns = [
            "cvar - Mean",
            "cvar - Median",
            "cvar - Mean1",
            "dvar - Count defined",
            "dvar - Count",
            "svar - Concatenate",
            "a",  # groupby variables are last two in metas
            "b",
        ]
        expected_rows = [
            [0.15, 0.15, 0.15, 2, 2, "sval1sval2", 1, 1],
            [0.3, 0.3, 0.3, 1, 2, "sval2", 1, 2],
            [0.433, 0.4, 0.433, 3, 3, "sval1sval2sval1", 1, 3],
            [1.5, 1.5, 1.5, 2, 2, "sval2sval1", 2, 1],
            [-0.5, -0.5, -0.5, 2, 2, "sval2sval1", 2, 2],
            [5, 5, 5, 2, 2, "sval2sval1", 2, 3],
        ]
        exp_df = pd.DataFrame(expected_rows, columns=expected_columns)

        out_df = table_to_frame(output, include_metas=True)

        pd.testing.assert_frame_equal(
            out_df,
            exp_df,
            check_dtype=False,
            check_column_type=False,
            check_categorical=False,
            atol=1e-3,
        )

    def test_preserve_table_class(self):
        """
        Test whether the result table has the same type as the input table,
        e.g. if the input table is a Corpus the resulting table must be a
        Corpus too.
        """
        data = AlternativeTable.from_table(self.data.domain, self.data)
        grouped = data.groupby([data.domain["a"]])
        output = grouped.aggregate({data.domain["a"]: ["mean"]})
        self.assertIsInstance(output, AlternativeTable)
|
||
|
||
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Oops, something went wrong.