Merge pull request #5541 from PrimozGodec/aggregate

[ENH] New widget: Group By
biolab · Oct 22, 2021 · 67e6a71 · 67e6a71
2 parents d8c952b + 02d205e
commit 67e6a71
Show file tree

Hide file tree

Showing 12 changed files with 1,713 additions and 1 deletion.
diff --git a/Orange/data/__init__.py b/Orange/data/__init__.py
@@ -10,3 +10,4 @@
 from .io import *
 from .filter import *
 from .pandas_compat import *
+from .aggregate import *
diff --git a/Orange/data/aggregate.py b/Orange/data/aggregate.py
@@ -0,0 +1,122 @@
+from functools import lru_cache
+from typing import Callable, Dict, List, Tuple, Union
+
+import pandas as pd
+
+from Orange.data import Domain, Table, Variable, table_from_frame, table_to_frame
+from Orange.util import dummy_callback
+
+
+class OrangeTableGroupBy:
+    """
+    Represents the result of a groupby operation on Orange's Table and
+    offers aggregation functionality on the groupby object. It wraps the
+    pandas GroupBy object.
+
+    Attributes
+    ----------
+    table
+        Table to be grouped
+    by
+        Variables used for grouping. Resulting groups are defined by unique
+        combinations of their values.
+
+    Examples
+    --------
+    from Orange.data import Table
+
+    table = Table("iris")
+    gb = table.groupby([table.domain["iris"]])
+    aggregated_table = gb.aggregate(
+        {table.domain["sepal length"]: ["mean", "median"],
+         table.domain["petal length"]: ["mean"]}
+    )
+    """
+
+    def __init__(self, table: Table, by: List[Variable]):
+        self.table = table
+
+        df = table_to_frame(table, include_metas=True)
+        # observed=True keeps only groups with at least one instance
+        self.group_by = df.groupby([a.name for a in by], observed=True)
+
+        # lru_cache that caches at the object (instance) level
+        self.compute_aggregation = lru_cache()(self._compute_aggregation)
+
+    def aggregate(
+        self,
+        aggregations: Dict[
+            Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
+        ],
+        callback: Callable = dummy_callback,
+    ) -> Table:
+        """
+        Compute aggregations for each group
+
+        Parameters
+        ----------
+        aggregations
+            The dictionary that defines aggregations that need to be computed
+            for variables. We support two formats:
+            - {variable: [agg function 1, agg function 2]}
+            - {variable: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
+            Where agg name is the aggregation name used in the output column name.
+            Aggregation function can be either function or string that defines
+            aggregation in Pandas (e.g. mean).
+        callback
+            Callback function to report the progress
+
+        Returns
+        -------
+        Table that includes aggregation columns. Variables that are used for
+        grouping are in metas.
+        """
+        num_aggs = sum(len(aggs) for aggs in aggregations.values())
+        count = 0
+
+        result_agg = []
+        for col, aggs in aggregations.items():
+            for agg in aggs:
+                res = self._compute_aggregation(col, agg)
+                result_agg.append(res)
+                count += 1
+                callback(count / num_aggs * 0.8)
+
+        agg_table = self._aggregations_to_table(result_agg)
+        callback(1)
+        return agg_table
+
+    def _compute_aggregation(
+        self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
+    ) -> pd.Series:
+        # use named aggregation to avoid clashes between identical column names when resetting the index
+        if isinstance(agg, tuple):
+            name, agg = agg
+        else:
+            name = agg if isinstance(agg, str) else agg.__name__
+        col_name = f"{col.name} - {name}"
+        return self.group_by[col.name].agg(**{col_name: agg})
+
+    def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
+        """Concatenate aggregation series and convert back to Table"""
+        if aggregations:
+            df = pd.concat(aggregations, axis=1)
+        else:
+            # when no aggregation is computed, return a table with only the groupby columns
+            df = self.group_by.first()
+            df = df.drop(columns=df.columns)
+        gb_attributes = df.index.names
+        df = df.reset_index()  # move group by var that are in index to columns
+        table = table_from_frame(df)
+
+        # group-by variables should be the last columns in metas in the output
+        metas = table.domain.metas
+        new_metas = [m for m in metas if m.name not in gb_attributes] + [
+            table.domain[n] for n in gb_attributes
+        ]
+        new_domain = Domain(
+            [var for var in table.domain.attributes if var.name not in gb_attributes],
+            metas=new_metas,
+        )
+        # keeps input table's type - e.g. output is Corpus if input Corpus
+        return self.table.from_table(new_domain, table)
diff --git a/Orange/data/table.py b/Orange/data/table.py
@@ -11,6 +11,7 @@
 from itertools import chain
 from numbers import Real, Integral
 from threading import Lock
+from typing import List, TYPE_CHECKING
 
 import bottleneck as bn
 import numpy as np
@@ -32,6 +33,9 @@
     stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \
     sparse_implicit_zero_weights
 from Orange.util import OrangeDeprecationWarning, dummy_callback
+if TYPE_CHECKING:
+    # import just for type checking - avoid circular import
+    from Orange.data.aggregate import OrangeTableGroupBy
 
 __all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"]
 
@@ -2227,6 +2231,23 @@ def densify(features):
         t.ids = self.ids  # preserve indices
         return t
 
+    def groupby(self, columns: List[Variable]) -> "OrangeTableGroupBy":
+        """
+        Group Table by variables defined in the columns list. Behaviour is
+        similar to Pandas groupby.
+
+        Parameters
+        ----------
+        columns
+            List of variables used to determine the groups
+
+        Returns
+        -------
+        GroupBy object of type OrangeTableGroupBy which holds information about
+        groups.
+        """
+        return Orange.data.aggregate.OrangeTableGroupBy(self, columns)
+
 
 def _dereferenced(array):
     # CSR and CSC matrices are constructed so that array.data is a

diff --git a/Orange/data/tests/test_aggregate.py b/Orange/data/tests/test_aggregate.py
@@ -0,0 +1,144 @@
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from Orange.data import (
+    DiscreteVariable,
+    ContinuousVariable,
+    Domain,
+    StringVariable,
+    Table,
+    table_to_frame,
+)
+
+
+def create_sample_data():
+    domain = Domain(
+        [
+            ContinuousVariable("a"),
+            ContinuousVariable("b"),
+            ContinuousVariable("cvar"),
+            DiscreteVariable("dvar", values=["val1", "val2"]),
+        ],
+        metas=[StringVariable("svar")],
+    )
+    return Table.from_numpy(
+        domain,
+        np.array(
+            [
+                [1, 1, 0.1, 0],
+                [1, 1, 0.2, 1],
+                [1, 2, np.nan, np.nan],
+                [1, 2, 0.3, 1],
+                [1, 3, 0.3, 0],
+                [1, 3, 0.4, 1],
+                [1, 3, 0.6, 0],
+                [2, 1, 1.0, 1],
+                [2, 1, 2.0, 0],
+                [2, 2, 3.0, 1],
+                [2, 2, -4.0, 0],
+                [2, 3, 5.0, 1],
+                [2, 3, 5.0, 0],
+            ]
+        ),
+        metas=np.array(
+            [
+                ["sval1"],
+                ["sval2"],
+                [""],
+                ["sval2"],
+                ["sval1"],
+                ["sval2"],
+                ["sval1"],
+                ["sval2"],
+                ["sval1"],
+                ["sval2"],
+                ["sval1"],
+                ["sval2"],
+                ["sval1"],
+            ]
+        ),
+    )
+
+
+# pylint: disable=abstract-method
+class AlternativeTable(Table):
+    pass
+
+
+class DomainTest(unittest.TestCase):
+    def setUp(self) -> None:
+        self.data = create_sample_data()
+
+    def test_simple_aggregation(self):
+        """Test aggregation results"""
+        d = self.data.domain
+        gb = self.data.groupby([d["a"]])
+        output = gb.aggregate({d["a"]: ["mean"], d["b"]: ["mean"]})
+
+        np.testing.assert_array_almost_equal(output.X, [[1, 2.143], [2, 2]], decimal=3)
+        np.testing.assert_array_almost_equal(output.metas, [[1], [2]], decimal=3)
+        self.assertListEqual(
+            ["a - mean", "b - mean"], [d.name for d in output.domain.attributes]
+        )
+        self.assertListEqual(["a"], [d.name for d in output.domain.metas])
+
+    def test_aggregation(self):
+        d = self.data.domain
+        gb = self.data.groupby([self.data.domain["a"], self.data.domain["b"]])
+        output = gb.aggregate(
+            {
+                d["cvar"]: [("Mean", "mean"), ("Median", "median"), ("Mean1", np.mean)],
+                d["dvar"]: [("Count defined", "count"), ("Count", "size")],
+                d["svar"]: [("Concatenate", "".join)],
+            }
+        )
+
+        expected_columns = [
+            "cvar - Mean",
+            "cvar - Median",
+            "cvar - Mean1",
+            "dvar - Count defined",
+            "dvar - Count",
+            "svar - Concatenate",
+            "a",  # groupby variables are last two in metas
+            "b",
+        ]
+
+        exp_df = pd.DataFrame(
+            [
+                [0.15, 0.15, 0.15, 2, 2, "sval1sval2", 1, 1],
+                [0.3, 0.3, 0.3, 1, 2, "sval2", 1, 2],
+                [0.433, 0.4, 0.433, 3, 3, "sval1sval2sval1", 1, 3],
+                [1.5, 1.5, 1.5, 2, 2, "sval2sval1", 2, 1],
+                [-0.5, -0.5, -0.5, 2, 2, "sval2sval1", 2, 2],
+                [5, 5, 5, 2, 2, "sval2sval1", 2, 3],
+            ],
+            columns=expected_columns,
+        )
+
+        out_df = table_to_frame(output, include_metas=True)
+
+        pd.testing.assert_frame_equal(
+            out_df,
+            exp_df,
+            check_dtype=False,
+            check_column_type=False,
+            check_categorical=False,
+            atol=1e-3,
+        )
+
+    def test_preserve_table_class(self):
+        """
+        Test whether the result table has the same type as the input table,
+        e.g. if the input table is a Corpus, the resulting table must be too.
+        """
+        data = AlternativeTable.from_table(self.data.domain, self.data)
+        gb = data.groupby([data.domain["a"]])
+        output = gb.aggregate({data.domain["a"]: ["mean"]})
+        self.assertIsInstance(output, AlternativeTable)
+
+
+if __name__ == "__main__":
+    unittest.main()