skrub-data · jeromedockes · Nov 15, 2023 · Nov 15, 2023 · Nov 15, 2023 · Nov 15, 2023
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -75,6 +75,7 @@ jobs:
       - shell: bash {0}
         run: |
           cp $GITHUB_WORKSPACE/pyproject.toml .
+          cp $GITHUB_WORKSPACE/conftest.py .
           $GITHUB_WORKSPACE/build_tools/github/test.sh
         working-directory: ${{ runner.temp }}
         name: 'Run tests'

diff --git a/CHANGES.rst b/CHANGES.rst
@@ -17,7 +17,7 @@ Major changes
 
 * :class:`InterpolationJoiner` was added to join two tables by using
   machine-learning to infer the matching rows from the second table.
-  :pr:`742` by :user:`Jérôme Dockès <jeromedockes>`.
+  :pr:`742` and :pr:`827` by :user:`Jérôme Dockès <jeromedockes>`.
 
 * Pipelines including :class:`TableVectorizer` can now be grid-searched, since
   we can now call `set_params` on the default transformers of :class:`TableVectorizer`.

diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,16 @@
+import pandas
+import pytest
+
+try:
+    import polars
+except ImportError:
+    # polars is optional: without it the fixture only yields pandas.
+    DATAFRAME_MODULES = [pandas]
+else:
+    DATAFRAME_MODULES = [pandas, polars]
+
+
+@pytest.fixture(params=DATAFRAME_MODULES)
+def px(request):
+    """Return one dataframe module per parametrization (see DATAFRAME_MODULES)."""
+    return request.param
diff --git a/setup.cfg b/setup.cfg
@@ -25,11 +25,12 @@ project_urls =
 include_package_data = True
 packages = find:
 install_requires =
-    scikit-learn>=1.2.1
+    scikit-learn>=1.3.0
     numpy>=1.23.5
     scipy>=1.9.3
-    pandas>=1.5.3
+    pandas>=2.1.0
     packaging>=23.1
+    dataframe-api-compat>=0.1.28
 python_requires = >=3.10
 
 [options.extras_require]
@@ -76,10 +77,10 @@ benchmarks =
 # Overwrite the previous install_requires for CI testing purposes
 # as defined in testing.yml.
 min-py310 =
-    scikit-learn==1.2.1
+    scikit-learn==1.3.0
     numpy==1.23.5
     scipy==1.9.3
-    pandas==1.5.3
+    pandas==2.1.0
 
 [flake8]
 # max line length for black

diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py
@@ -0,0 +1,4 @@
+from ._common import Selector, std, stdns
+from ._namespace import get_df_namespace, skrubns
+
+__all__ = ["get_df_namespace", "skrubns", "std", "stdns", "Selector"]
diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py
@@ -0,0 +1,39 @@
+import enum
+
+
+class Selector(enum.Enum):
+    """Symbolic column selections usable instead of explicit column names."""
+
+    ALL = enum.auto()
+    NONE = enum.auto()
+    NUMERIC = enum.auto()
+    CATEGORICAL = enum.auto()
+
+
+def std(obj):
+    """Return the consortium-standard wrapper of a dataframe or column.
+
+    ``obj.__dataframe_consortium_standard__()`` is used when ``obj`` has that
+    method, otherwise ``obj.__column_consortium_standard__()``.
+    """
+    # Guard only the attribute lookup: an AttributeError raised while building
+    # the dataframe wrapper must not be mistaken for "obj is a column".
+    try:
+        to_standard = obj.__dataframe_consortium_standard__
+    except AttributeError:
+        to_standard = obj.__column_consortium_standard__
+    return to_standard()
+
+
+def stdns(obj):
+    """Return the consortium-standard namespace of a dataframe or column.
+
+    Dataframes are resolved through ``__dataframe_namespace__()`` and columns
+    through ``__column_namespace__()`` of their standard wrapper.
+    """
+    # Guard only the attribute lookup, for the same reason as in ``std``.
+    try:
+        to_standard = obj.__dataframe_consortium_standard__
+    except AttributeError:
+        return obj.__column_consortium_standard__().__column_namespace__()
+    return to_standard().__dataframe_namespace__()
diff --git a/skrub/_dataframe/_namespace.py b/skrub/_dataframe/_namespace.py
@@ -97,3 +97,9 @@ def get_df_namespace(*dfs):
                 "Only Pandas or Polars dataframes are currently supported, "
                 f"got {modules=!r}."
             )
+
+
+def skrubns(*dataframes):
+    """Return only the first item of ``get_df_namespace(*dataframes)``."""
+    namespace, _ = get_df_namespace(*dataframes)
+    return namespace
diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py
@@ -9,6 +9,22 @@
 
 from skrub._utils import atleast_1d_or_none
 
+from ._common import Selector
+
+__all__ = [
+    "make_dataframe",
+    "make_series",
+    "aggregate",
+    "join",
+    "split_num_categ_cols",
+    "select",
+    "drop",
+    "Selector",
+    "concat_horizontal",
+    "any_rowwise",
+    "to_pandas",
+]
+
 
 def make_dataframe(X, index=None):
     """Convert an dictionary of columns into a Pandas dataframe.
@@ -329,4 +345,58 @@
 
 
 def select(dataframe, columns):
-    return dataframe[columns]
+    """Return the columns of ``dataframe`` designated by ``columns``.
+
+    ``columns`` is either a ``Selector`` member or any key accepted by
+    ``dataframe[...]``, such as a list of column names.
+    """
+    if not isinstance(columns, Selector):
+        return dataframe[columns]
+    if columns is Selector.ALL:
+        return dataframe
+    if columns is Selector.NONE:
+        return dataframe[[]]
+    if columns is Selector.NUMERIC:
+        return dataframe.select_dtypes("number")
+    if columns is Selector.CATEGORICAL:
+        return dataframe.select_dtypes(["object", "string", "category"])
+    # All Selector members are handled above. Raise explicitly instead of
+    # ``assert False``, which is stripped under ``python -O`` and would let the
+    # function silently return None.
+    raise AssertionError(f"Unhandled selector: {columns!r}")
+
+
+def drop(dataframe, columns):
+    """Return ``dataframe`` without the columns designated by ``columns``.
+
+    ``columns`` is either a ``Selector`` member or column label(s) accepted by
+    ``DataFrame.drop``.
+    """
+    if isinstance(columns, Selector):
+        columns = select(dataframe, columns).columns.values
+    # Explicit labels go straight to ``DataFrame.drop``: routing them through
+    # ``select`` breaks for a single label, where ``dataframe[label]`` is a
+    # Series and has no ``.columns``.
+    return dataframe.drop(columns, axis=1)
+
+
+def any_rowwise(dataframe):
+    """Return a boolean Series telling, for each row, whether any value is true."""
+    return dataframe.any(axis=1)
+
+
+def concat_horizontal(dataframe, *other_dataframes):
+    """Concatenate dataframes column-wise, matching rows by position.
+
+    Each of ``other_dataframes`` is given the index of ``dataframe`` before
+    concatenation, so rows are paired by position rather than by their
+    original index labels.
+    """
+    aligned = [
+        df.set_axis(dataframe.index, axis="index") for df in other_dataframes
+    ]
+    return pd.concat([dataframe, *aligned], axis=1)
+
+
+def to_pandas(dataframe):
+    """Return ``dataframe`` as is, since it is already a pandas dataframe."""
+    return dataframe
diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py
@@ -13,6 +13,23 @@
 
 from skrub._utils import atleast_1d_or_none
 
+from ._common import Selector
+
+__all__ = [
+    "POLARS_SETUP",
+    "make_dataframe",
+    "make_series",
+    "aggregate",
+    "join",
+    "split_num_categ_cols",
+    "select",
+    "drop",
+    "Selector",
+    "concat_horizontal",
+    "any_rowwise",
+    "to_pandas",
+]
+
 
 def make_dataframe(X, index=None):
     """Convert an dictionary of columns into a Polars dataframe.
@@ -263,5 +280,61 @@
     return num_cols, categ_cols
 
 
+def _check_selector(columns):
+    """Translate a ``Selector`` member into its Polars counterpart.
+
+    Anything that is not a ``Selector`` is returned unchanged.
+    """
+    if not isinstance(columns, Selector):
+        return columns
+    if columns is Selector.ALL:
+        return cs.all()
+    if columns is Selector.NONE:
+        return []
+    if columns is Selector.NUMERIC:
+        return cs.numeric()
+    if columns is Selector.CATEGORICAL:
+        return cs.string(include_categorical=True)
+    # All Selector members are handled above. Raise explicitly instead of
+    # ``assert False``, which is stripped under ``python -O`` and would let the
+    # function silently return None.
+    raise AssertionError(f"Unhandled selector: {columns!r}")
+
+
 def select(dataframe, columns):
-    return dataframe.select(columns)
+    """Return the columns of ``dataframe`` designated by ``columns``."""
+    return dataframe.select(_check_selector(columns))
+
+
+def drop(dataframe, columns):
+    """Return ``dataframe`` without the columns designated by ``columns``."""
+    return dataframe.drop(_check_selector(columns))
+
+
+def _collect(dataframe):
+    """Call ``dataframe.collect()`` when available, else return it unchanged."""
+    if hasattr(dataframe, "collect"):
+        dataframe = dataframe.collect()
+    return dataframe
+
+
+def any_rowwise(dataframe):
+    """Return a boolean Series flagging rows that hold at least one true value.
+
+    The Series is named ``"any"``.
+    """
+    # Name the result explicitly so the lookup below does not depend on the
+    # default output name polars gives to ``any_horizontal``.
+    row_any = pl.any_horizontal(pl.all()).alias("any")
+    return _collect(dataframe.select(row_any)).get_column("any")
+
+
+def concat_horizontal(dataframe, *other_dataframes):
+    """Concatenate dataframes column-wise, collecting each one first."""
+    frames = [_collect(df) for df in (dataframe, *other_dataframes)]
+    return pl.concat(frames, how="horizontal")
+
+
+def to_pandas(dataframe):
+    """Convert ``dataframe`` (collected first if needed) to pandas."""
+    return _collect(dataframe).to_pandas()
diff --git a/skrub/_dataframe/tests/test_all.py b/skrub/_dataframe/tests/test_all.py
@@ -0,0 +1,53 @@
+import pytest
+
+from skrub._dataframe import Selector, skrubns
+
+
+@pytest.fixture
+def df(px):
+    """Three-row dataframe with an integer, a string and a float column."""
+    return px.DataFrame(
+        {
+            "ID": [2, 3, 7],
+            "name": ["ab", "cd", "01"],
+            "temp": [20.3, 40.9, 11.5],
+        }
+    )
+
+
+def _names(frame):
+    """Return the column names of ``frame`` as a list."""
+    return list(frame.columns)
+
+
+def test_select(df):
+    """``select`` handles explicit names and every ``Selector`` member."""
+    ns = skrubns(df)
+    assert _names(ns.select(df, [])) == []
+    assert _names(ns.select(df, ["name"])) == ["name"]
+    assert _names(ns.select(df, Selector.ALL)) == _names(df)
+    assert _names(ns.select(df, Selector.NONE)) == []
+    assert _names(ns.select(df, Selector.NUMERIC)) == ["ID", "temp"]
+    assert _names(ns.select(df, Selector.CATEGORICAL)) == ["name"]
+
+
+def test_drop(df):
+    """``drop`` handles explicit names and every ``Selector`` member."""
+    ns = skrubns(df)
+    assert _names(ns.drop(df, [])) == _names(df)
+    assert _names(ns.drop(df, ["name"])) == ["ID", "temp"]
+    assert _names(ns.drop(df, Selector.ALL)) == []
+    assert _names(ns.drop(df, Selector.NONE)) == _names(df)
+    assert _names(ns.drop(df, Selector.NUMERIC)) == ["name"]
+    assert _names(ns.drop(df, Selector.CATEGORICAL)) == ["ID", "temp"]
+
+
+def test_concat_horizontal(df):
+    """``concat_horizontal`` keeps the columns of all inputs, in order."""
+    ns = skrubns(df)
+    suffixed = {c: f"{c}_1" for c in df.columns}
+    standard_df = df.__dataframe_consortium_standard__()
+    renamed = standard_df.rename_columns(suffixed).dataframe
+    assert _names(ns.concat_horizontal(df)) == _names(df)
+    combined = ns.concat_horizontal(df, renamed)
+    assert _names(combined) == _names(df) + _names(renamed)
diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py
@@ -0,0 +1,12 @@
+from skrub._dataframe import skrubns, std, stdns
+
+
+def test_std(px):
+    """``std`` and ``stdns`` accept both a dataframe and a column."""
+    frame = px.DataFrame({"A": [1, 2]})
+    assert hasattr(std(frame), "dataframe")
+    assert hasattr(stdns(frame), "dataframe_from_columns")
+
+    column = skrubns(frame).make_series([1, 2], name="A")
+    assert hasattr(std(column), "column")
+    assert hasattr(stdns(column), "dataframe_from_columns")