Skip to content

Commit

Permalink
Merge pull request #5541 from PrimozGodec/aggregate
Browse files Browse the repository at this point in the history
[ENH] New widget: Group By
  • Loading branch information
janezd authored Oct 22, 2021
2 parents d8c952b + 02d205e commit 67e6a71
Show file tree
Hide file tree
Showing 12 changed files with 1,713 additions and 1 deletion.
1 change: 1 addition & 0 deletions Orange/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
from .io import *
from .filter import *
from .pandas_compat import *
from .aggregate import *
122 changes: 122 additions & 0 deletions Orange/data/aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from functools import lru_cache
from typing import Callable, Dict, List, Tuple, Union

import pandas as pd

from Orange.data import Domain, Table, Variable, table_from_frame, table_to_frame
from Orange.util import dummy_callback


class OrangeTableGroupBy:
    """
    Result of a group-by operation on Orange's Table.

    The class wraps pandas' GroupBy object and offers aggregation
    functionality on top of it.

    Parameters
    ----------
    table
        Table to be grouped
    by
        Variables used for grouping. Resulting groups are defined with unique
        combinations of those variables' values.

    Attributes
    ----------
    table
        The table that was passed to the constructor
    group_by
        pandas GroupBy object created from the table's data frame
    compute_aggregation
        Version of ``_compute_aggregation`` wrapped in a per-instance
        ``lru_cache``

    Examples
    --------
    from Orange.data import Table

    table = Table("iris")
    gb = table.groupby([table.domain["iris"]])
    aggregated_table = gb.aggregate(
        {table.domain["sepal length"]: ["mean", "median"],
         table.domain["petal length"]: ["mean"]}
    )
    """

    def __init__(self, table: Table, by: List[Variable]):
        self.table = table

        df = table_to_frame(table, include_metas=True)
        # observed=True keeps only groups with at least one instance
        self.group_by = df.groupby([a.name for a in by], observed=True)

        # lru_cache is applied here (and not as a decorator) so that the cache
        # is kept on the object level instead of being shared by all instances
        self.compute_aggregation = lru_cache()(self._compute_aggregation)

    def aggregate(
        self,
        aggregations: Dict[
            Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
        ],
        callback: Callable = dummy_callback,
    ) -> Table:
        """
        Compute aggregations for each group

        Parameters
        ----------
        aggregations
            The dictionary that defines aggregations that need to be computed
            for variables. We support two formats:
            - {variable: [agg function 1, agg function 2]}
            - {variable: [(agg name 1, agg function 1),
                          (agg name 2, agg function 2)]}
            Where agg name is the aggregation name used in the output column
            name. Aggregation function can be either function or string that
            defines aggregation in Pandas (e.g. mean).
        callback
            Callback function to report the progress. It is called with
            values up to 0.8 while aggregations are computed and with 1 when
            the output table is ready.

        Returns
        -------
        Table that includes aggregation columns. Variables that are used for
        grouping are in metas.
        """
        num_aggs = sum(len(aggs) for aggs in aggregations.values())
        count = 0

        result_agg = []
        for col, aggs in aggregations.items():
            for agg in aggs:
                result_agg.append(self._cached_aggregation(col, agg))
                count += 1
                callback(count / num_aggs * 0.8)

        agg_table = self._aggregations_to_table(result_agg)
        callback(1)
        return agg_table

    def _cached_aggregation(
        self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
    ) -> pd.DataFrame:
        """Compute one aggregation, reusing the per-instance cache if possible"""
        try:
            hash((col, agg))
        except TypeError:
            # unhashable arguments cannot be used as cache keys - compute
            # them directly, as if there was no cache
            return self._compute_aggregation(col, agg)
        return self.compute_aggregation(col, agg)

    def _compute_aggregation(
        self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
    ) -> pd.DataFrame:
        """
        Aggregate a single column of each group with a single aggregation

        Parameters
        ----------
        col
            Variable whose column is aggregated
        agg
            Aggregation (pandas' aggregation name or function) or a tuple
            (name used in the output column name, aggregation)

        Returns
        -------
        Result of pandas' named aggregation with one column named
        "<variable name> - <aggregation name>"
        """
        # use named aggregation to avoid issues with same column names when reset_index
        if isinstance(agg, tuple):
            name, agg = agg
        else:
            name = agg if isinstance(agg, str) else agg.__name__
        col_name = f"{col.name} - {name}"
        return self.group_by[col.name].agg(**{col_name: agg})

    def _aggregations_to_table(self, aggregations: List[pd.DataFrame]) -> Table:
        """Concatenate aggregation results and convert them back to Table"""
        if aggregations:
            df = pd.concat(aggregations, axis=1)
        else:
            # when no aggregation is computed return a table with only the
            # group-by columns: keep the index (groups) and drop all columns
            df = self.group_by.first()
            df = df.drop(columns=df.columns)
        gb_attributes = df.index.names
        df = df.reset_index()  # move group by var that are in index to columns
        table = table_from_frame(df)

        # group by variables should be the last columns in metas in the output
        metas = table.domain.metas
        new_metas = [m for m in metas if m.name not in gb_attributes] + [
            table.domain[n] for n in gb_attributes
        ]
        new_domain = Domain(
            [var for var in table.domain.attributes if var.name not in gb_attributes],
            metas=new_metas,
        )
        # keeps input table's type - e.g. output is Corpus if input Corpus
        return self.table.from_table(new_domain, table)
21 changes: 21 additions & 0 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from itertools import chain
from numbers import Real, Integral
from threading import Lock
from typing import List, TYPE_CHECKING

import bottleneck as bn
import numpy as np
Expand All @@ -32,6 +33,9 @@
stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \
sparse_implicit_zero_weights
from Orange.util import OrangeDeprecationWarning, dummy_callback
if TYPE_CHECKING:
# import just for type checking - avoid circular import
from Orange.data.aggregate import OrangeTableGroupBy

# Public names exported by this module on a star-import.
__all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"]

Expand Down Expand Up @@ -2227,6 +2231,23 @@ def densify(features):
t.ids = self.ids # preserve indices
return t

def groupby(self, columns: List[Variable]) -> "OrangeTableGroupBy":
    """
    Split the table into groups according to the given variables.

    The behaviour is similar to Pandas groupby.

    Parameters
    ----------
    columns
        Variables whose values determine the groups

    Returns
    -------
    OrangeTableGroupBy object that holds the information about the groups.
    """
    # the class is resolved through the package at call time; the module
    # only imports it under TYPE_CHECKING to avoid a circular import
    group_by_class = Orange.data.aggregate.OrangeTableGroupBy
    return group_by_class(self, columns)


def _dereferenced(array):
# CSR and CSC matrices are constructed so that array.data is a
Expand Down
144 changes: 144 additions & 0 deletions Orange/data/tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import unittest

import numpy as np
import pandas as pd

from Orange.data import (
DiscreteVariable,
ContinuousVariable,
Domain,
StringVariable,
Table,
table_to_frame,
)


def create_sample_data():
    """
    Build the table used by the aggregation tests.

    The table has 13 rows, three continuous attributes ("a", "b", "cvar"),
    one discrete attribute ("dvar") and one string meta ("svar"). The third
    row has missing "cvar" and "dvar" values and an empty "svar".
    """
    attributes = [
        ContinuousVariable("a"),
        ContinuousVariable("b"),
        ContinuousVariable("cvar"),
        DiscreteVariable("dvar", values=["val1", "val2"]),
    ]
    domain = Domain(attributes, metas=[StringVariable("svar")])

    # columns: a, b, cvar, dvar (index into the dvar values)
    x = np.array(
        [
            [1, 1, 0.1, 0],
            [1, 1, 0.2, 1],
            [1, 2, np.nan, np.nan],
            [1, 2, 0.3, 1],
            [1, 3, 0.3, 0],
            [1, 3, 0.4, 1],
            [1, 3, 0.6, 0],
            [2, 1, 1.0, 1],
            [2, 1, 2.0, 0],
            [2, 2, 3.0, 1],
            [2, 2, -4.0, 0],
            [2, 3, 5.0, 1],
            [2, 3, 5.0, 0],
        ]
    )

    # one svar value per row, turned into a single-column array below
    svar_values = [
        "sval1",
        "sval2",
        "",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
        "sval2",
        "sval1",
    ]
    metas = np.array([[value] for value in svar_values])

    return Table.from_numpy(domain, x, metas=metas)


# pylint: disable=abstract-method
class AlternativeTable(Table):
    """
    Table subclass without any changes; the tests use it to check that
    aggregation returns a table of the same class as its input.
    """


class DomainTest(unittest.TestCase):
    """Tests of Table.groupby and of aggregation on the resulting object."""

    def setUp(self) -> None:
        self.data = create_sample_data()

    def test_simple_aggregation(self):
        """Group by one variable and compute the mean of two columns"""
        domain = self.data.domain
        var_a, var_b = domain["a"], domain["b"]

        grouped = self.data.groupby([var_a])
        output = grouped.aggregate({var_a: ["mean"], var_b: ["mean"]})

        np.testing.assert_array_almost_equal(output.X, [[1, 2.143], [2, 2]], decimal=3)
        np.testing.assert_array_almost_equal(output.metas, [[1], [2]], decimal=3)

        attribute_names = [var.name for var in output.domain.attributes]
        self.assertListEqual(["a - mean", "b - mean"], attribute_names)
        meta_names = [var.name for var in output.domain.metas]
        self.assertListEqual(["a"], meta_names)

    def test_aggregation(self):
        """Group by two variables and use named aggregations of all kinds"""
        domain = self.data.domain
        grouped = self.data.groupby([domain["a"], domain["b"]])

        aggregations = {
            domain["cvar"]: [
                ("Mean", "mean"),
                ("Median", "median"),
                ("Mean1", np.mean),
            ],
            domain["dvar"]: [("Count defined", "count"), ("Count", "size")],
            domain["svar"]: [("Concatenate", "".join)],
        }
        output = grouped.aggregate(aggregations)

        expected_columns = [
            "cvar - Mean",
            "cvar - Median",
            "cvar - Mean1",
            "dvar - Count defined",
            "dvar - Count",
            "svar - Concatenate",
            "a",  # groupby variables are last two in metas
            "b",
        ]
        expected_rows = [
            [0.15, 0.15, 0.15, 2, 2, "sval1sval2", 1, 1],
            [0.3, 0.3, 0.3, 1, 2, "sval2", 1, 2],
            [0.433, 0.4, 0.433, 3, 3, "sval1sval2sval1", 1, 3],
            [1.5, 1.5, 1.5, 2, 2, "sval2sval1", 2, 1],
            [-0.5, -0.5, -0.5, 2, 2, "sval2sval1", 2, 2],
            [5, 5, 5, 2, 2, "sval2sval1", 2, 3],
        ]
        exp_df = pd.DataFrame(expected_rows, columns=expected_columns)

        out_df = table_to_frame(output, include_metas=True)

        pd.testing.assert_frame_equal(
            out_df,
            exp_df,
            check_dtype=False,
            check_column_type=False,
            check_categorical=False,
            atol=1e-3,
        )

    def test_preserve_table_class(self):
        """
        Test whether the resulting table has the same type as the input
        table, e.g. if the input table is a corpus the resulting table must
        be a corpus too.
        """
        data = AlternativeTable.from_table(self.data.domain, self.data)
        var_a = data.domain["a"]

        output = data.groupby([var_a]).aggregate({var_a: ["mean"]})

        self.assertIsInstance(output, AlternativeTable)


# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Loading

0 comments on commit 67e6a71

Please sign in to comment.