From 912d545921b69d61bb0a3355177fcda0935bad01 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 14:05:26 +0100 Subject: [PATCH 01/17] start implementing interpolation join with dataframe api --- skrub/_dataframe/__init__.py | 3 + skrub/_dataframe/_common.py | 9 +++ skrub/_dataframe/_pandas.py | 60 ++++++++++++++++++- skrub/_dataframe/_polars.py | 71 ++++++++++++++++++++++- skrub/_interpolation_joiner.py | 102 ++++++++++++++++++++++----------- skrub/_join_utils.py | 3 +- 6 files changed, 211 insertions(+), 37 deletions(-) create mode 100644 skrub/_dataframe/_common.py diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py index e69de29bb..6dec0a176 100644 --- a/skrub/_dataframe/__init__.py +++ b/skrub/_dataframe/__init__.py @@ -0,0 +1,3 @@ +from ._namespace import get_df_namespace + +__all__ = ["get_df_namespace"] diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py new file mode 100644 index 000000000..8d7c6a662 --- /dev/null +++ b/skrub/_dataframe/_common.py @@ -0,0 +1,9 @@ +import enum + + +class Selector(enum.Enum): + ALL = enum.auto() + NONE = enum.auto() + NUMERIC = enum.auto() + CATEGORICAL = enum.auto() + STRING = enum.auto() diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 933e8e5b1..499bdd446 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -9,6 +9,24 @@ from skrub._utils import atleast_1d_or_none +from ._common import Selector + +__all__ = [ + "make_dataframe", + "make_series", + "aggregate", + "join", + "split_num_categ_cols", + "select", + "drop", + "Selector", + "concatenate", + "any_rowwise", + "collect", + "to_numpy", + "to_pandas", +] + def make_dataframe(X, index=None): """Convert an dictionary of columns into a Pandas dataframe. @@ -329,4 +347,44 @@ def split_num_categ_cols(table): def select(dataframe, columns): - return dataframe[columns] + if not isinstance(columns, Selector): + return dataframe[columns] + if columns is Selector.ALL: + return dataframe + elif columns is Selector.NONE: + return dataframe[[]] + elif columns is Selector.NUMERIC: + return dataframe.select_dtypes("number") + elif columns is Selector.CATEGORICAL: + return dataframe.select_dtypes(["object", "string", "category"]) + elif columns is Selector.STRING: + return dataframe.select_dtypes(["string"]) + # we have covered all items in the enumeration + assert False + + +def drop(dataframe, columns): + return dataframe.drop(select(dataframe, columns).columns.values) + + +def any_rowwise(dataframe): + return dataframe.any(axis=1) + + +def concatenate(dataframe, *other_dataframes): + other_dataframes = [ + df.set_axis(dataframe.index, axis="index") for df in other_dataframes + ] + return pd.concat([dataframe] + list(other_dataframes), axis=1) + + +def collect(dataframe): + return dataframe + + +def to_pandas(dataframe): + return dataframe + + +def to_numpy(dataframe): + return dataframe.to_numpy() diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index 714973ede..d17af0dd4 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -13,6 +13,25 @@ from skrub._utils import atleast_1d_or_none +from ._common import Selector + +__all__ = [ + "POLARS_SETUP", + "make_dataframe", + "make_series", + "aggregate", + "join", + "split_num_categ_cols", + "select", + "drop", + "Selector", + "concatenate", + "any_rowwise", + "collect", + "to_numpy", + "to_pandas", +] + def make_dataframe(X, index=None): """Convert an dictionary of columns into a Polars dataframe. 
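
The `Selector` enum gives both backends a shared vocabulary for requesting
column subsets without repeating dtype lists at every call site. A minimal
usage sketch of the pandas implementation above (illustrative frame; `drop`
assumes the `axis=1` fix that lands later in this series):

    import pandas as pd

    from skrub._dataframe._pandas import Selector, drop, select

    df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "temp": [0.5, 1.5]})
    select(df, Selector.NUMERIC)      # columns "id" and "temp"
    select(df, Selector.CATEGORICAL)  # column "name"
    drop(df, Selector.NUMERIC)        # keeps only "name"
    select(df, ["name"])              # plain column lists still pass through
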
@@ -263,5 +282,55 @@ def split_num_categ_cols(table): return num_cols, categ_cols +def _check_selector(columns): + if not isinstance(columns, Selector): + return columns + if columns is Selector.ALL: + return cs.all() + elif columns is Selector.NONE: + return [] + elif columns is Selector.NUMERIC: + return cs.numeric() + elif columns is Selector.CATEGORICAL: + return cs.string(include_categorical=True) + elif columns is Selector.STRING: + return cs.string() + # we have covered all items in the enumeration + assert False + + def select(dataframe, columns): - return dataframe.select(columns) + return dataframe.select(_check_selector(columns)) + + +def drop(dataframe, columns): + return dataframe.drop(_check_selector(columns)) + + +def any_rowwise(dataframe): + return collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") + + +def concatenate(dataframe, *other_dataframes): + return pl.concat( + [collect(dataframe)] + [collect(df) for df in other_dataframes], + how="horizontal", + ) + + +def collect(dataframe): + if hasattr(dataframe, "collect"): + dataframe = dataframe.collect() + return dataframe + + +def to_pandas(dataframe): + if hasattr(dataframe, "collect"): + dataframe = dataframe.collect() + return dataframe.to_pandas() + + +def to_numpy(dataframe): + if hasattr(dataframe, "to_numpy"): + return dataframe.to_numpy() + return dataframe.collect().to_numpy() diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 92e9bf30b..8d5862f3a 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -2,7 +2,6 @@ import joblib import numpy as np -import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.ensemble import ( HistGradientBoostingClassifier, @@ -11,6 +10,7 @@ from sklearn.utils._tags import _safe_tags from skrub import _join_utils, _utils +from skrub._dataframe import get_df_namespace from skrub._minhash_encoder import MinHashEncoder from skrub._table_vectorizer import TableVectorizer @@ -221,12 +221,16 @@ def fit(self, X, y=None): self._check_inputs() if X is not None: _join_utils.check_missing_columns(X, self._main_key, "'X' (the main table)") - key_values = self.vectorizer_.fit_transform(self.aux_table[self._aux_key]) + aux_table = self.aux_table.__dataframe_consortium_standard__() + # TODO avoid conversion to pandas when TableVectorizer supports it + ns, _ = get_df_namespace(self.aux_table) + df = ns.to_pandas(aux_table.select(*self._aux_key).dataframe) + key_values = self.vectorizer_.fit_transform(df) estimators = self._get_estimator_assignments() fit_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_fit)( key_values, - self.aux_table[assignment["columns"]], + aux_table.select(*assignment["columns"]).dataframe, assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), ) @@ -281,45 +285,52 @@ def transform(self, X): The result of the join between `X` and inferred rows from ``self.aux_table``. 
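
On the polars side, `_check_selector` above maps each `Selector` member to a
`polars.selectors` expression; roughly the following equivalence (a sketch,
assuming polars is installed):

    import polars as pl
    import polars.selectors as cs

    df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"], "temp": [0.5, 1.5]})
    df.select(cs.numeric())                         # Selector.NUMERIC
    df.select(cs.string(include_categorical=True))  # Selector.CATEGORICAL
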
""" - main_table = X + main_table = X.__dataframe_consortium_standard__() _join_utils.check_missing_columns( - main_table, self._main_key, "'X' (the main table)" + main_table.dataframe, self._main_key, "'X' (the main table)" ) - key_values = self.vectorizer_.transform( - main_table[self._main_key].set_axis(self._aux_key, axis="columns") + df = ( + main_table.select(*self._main_key) + .rename_columns(dict(zip(main_table.column_names, self._aux_key))) + .dataframe ) + # TODO avoid conversion to pandas when vectorizer supports it + ns, _ = get_df_namespace(df) + df = ns.to_pandas(df) + key_values = self.vectorizer_.transform(df) prediction_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_predict)( key_values, assignment["columns"], assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), + ns=get_df_namespace(self.aux_table)[0], ) for assignment in self.estimators_ ) prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - for part in predictions: - part.index = main_table.index - return pd.concat([main_table] + predictions, axis=1) + ns, _ = get_df_namespace(self.aux_table) + return ns.concatenate(main_table.dataframe, *predictions) def _check_prediction_results(self, results): checked_results = [] failed_columns = [] + ns, _ = get_df_namespace(self.aux_table) for res in results: new_res = dict(**res) if res["failed"]: - if set(res["columns"]).issubset( - self.aux_table.select_dtypes("number").columns.values - ): - dtype = float - else: - dtype = object - pred = pd.DataFrame( - columns=res["columns"], - index=np.arange(res["shape"][0]), - dtype=dtype, + # TODO + # numeric_cols = list( + # ns.select(self.aux_table, ns.Selector.NUMERIC).columns + # ) + # if set(res["columns"]).issubset(numeric_cols): + # dtype = float + # else: + # dtype = object + pred = ns.make_dataframe( + {c: [None for _ in range(res["shape"][0])] for c in res["columns"]}, ) new_res["predictions"] = pred failed_columns.extend(res["columns"]) @@ -351,13 +362,16 @@ def _get_estimator_assignments(self): When the estimator does not handle multi-output, an estimator is fitted separately to each column. """ - aux_table = self.aux_table.drop(self._aux_key, axis=1) + aux_table = self.aux_table.__dataframe_consortium_standard__().drop_columns( + *self._aux_key + ) + ns, _ = get_df_namespace(aux_table.dataframe) assignments = [] - regression_table = aux_table.select_dtypes("number") + regression_table = ns.select(aux_table.dataframe, ns.Selector.NUMERIC) assignments.extend( _get_assignments_for_estimator(regression_table, self.regressor_) ) - classification_table = aux_table.select_dtypes(["object", "string", "category"]) + classification_table = ns.select(aux_table.dataframe, ns.Selector.CATEGORICAL) assignments.extend( _get_assignments_for_estimator(classification_table, self.classifier_) ) @@ -373,15 +387,19 @@ def _get_assignments_for_estimator(table, estimator): # estimator is empty (eg the estimator is the regressor and there are no # numerical columns), return an empty list -- no columns are assigned to # that estimator. 
- if table.empty: + table = table.__dataframe_consortium_standard__() + if not len(table.column_names): return [] if not _handles_multioutput(estimator): - return [{"columns": [col], "estimator": estimator} for col in table.columns] - columns_with_nulls = table.columns[table.isnull().any()] + return [ + {"columns": [col], "estimator": estimator} for col in table.column_names + ] + table = table.persist() + columns_with_nulls = [c for c in table.column_names if table.col(c).is_null().any()] assignments = [ {"columns": [col], "estimator": estimator} for col in columns_with_nulls ] - columns_without_nulls = list(set(table.columns).difference(columns_with_nulls)) + columns_without_nulls = list(set(table.column_names).difference(columns_with_nulls)) if columns_without_nulls: assignments.append({"columns": columns_without_nulls, "estimator": estimator}) return assignments @@ -392,14 +410,21 @@ def _handles_multioutput(estimator): def _fit(key_values, target_table, estimator, propagate_exceptions): + target_table = target_table.__dataframe_consortium_standard__() estimator = clone(estimator) - kept_rows = target_table.notnull().all(axis=1).to_numpy() + ns, _ = get_df_namespace(target_table.dataframe) + kept_rows = ~( + ns.any_rowwise(target_table.is_null().dataframe) + .__column_consortium_standard__() + .to_array() + ) key_values = key_values[kept_rows] - Y = target_table.to_numpy()[kept_rows] + target_table = target_table.persist() + Y = target_table.to_array(None)[kept_rows] # Estimators that expect a single output issue a DataConversionWarning if # passing a column vector rather than a 1-D array - if len(target_table.columns) == 1: + if len(target_table.column_names) == 1: Y = Y.ravel() failed = False try: @@ -409,10 +434,14 @@ def _fit(key_values, target_table, estimator, propagate_exceptions): raise failed = True estimator = None - return {"columns": target_table.columns, "estimator": estimator, "failed": failed} + return { + "columns": target_table.column_names, + "estimator": estimator, + "failed": failed, + } -def _predict(key_values, columns, estimator, propagate_exceptions): +def _predict(key_values, columns, estimator, propagate_exceptions, ns): failed = False try: Y_values = estimator.predict(key_values) @@ -423,7 +452,9 @@ def _predict(key_values, columns, estimator, propagate_exceptions): if failed: predictions = None else: - predictions = pd.DataFrame(data=Y_values, columns=columns) + predictions = ns.make_dataframe( + dict(zip(columns, np.atleast_2d(Y_values.T).T.T)) + ) return { "predictions": predictions, "failed": failed, @@ -437,5 +468,8 @@ def _add_column_name_suffix(dataframes, suffix): return dataframes renamed = [] for df in dataframes: - renamed.append(df.rename(columns={c: f"{c}{suffix}" for c in df.columns})) + df = df.__dataframe_consortium_standard__() + renamed.append( + df.rename_columns({c: f"{c}{suffix}" for c in df.column_names}).dataframe + ) return renamed diff --git a/skrub/_join_utils.py b/skrub/_join_utils.py index 7d4dd3432..6a92b5964 100644 --- a/skrub/_join_utils.py +++ b/skrub/_join_utils.py @@ -61,7 +61,8 @@ def check_missing_columns(table, key, table_name): table_name : str Name by which to refer to `table` in the error message if necessary. 
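
Whether several target columns can share one estimator is decided by the
scikit-learn `multioutput` estimator tag, presumably read through the
`_safe_tags` import kept at the top of `_interpolation_joiner.py`. A sketch
of the check (hedged: `_handles_multioutput`'s body is not shown in this
diff):

    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.utils._tags import _safe_tags

    # False for HistGradientBoostingRegressor, the default regressor here,
    # so each numeric column would get its own cloned estimator
    _safe_tags(HistGradientBoostingRegressor(), key="multioutput")
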
""" - missing_columns = set(key) - set(table.columns) + table = table.__dataframe_consortium_standard__() + missing_columns = set(key) - set(table.column_names) if not missing_columns: return raise ValueError( From cd3e9423a2cf8b7aab78b0f7c78d18fd6271e23e Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 14:26:20 +0100 Subject: [PATCH 02/17] store schema rather than columns --- skrub/_interpolation_joiner.py | 43 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 8d5862f3a..1aa986b06 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -125,8 +125,8 @@ class InterpolationJoiner(TransformerMixin, BaseEstimator): estimators_ : list of dicts The estimators used to infer values to be joined. Each entry in this list is a dictionary with keys ``"estimator"`` (the fitted estimator) - and ``"columns"`` (the list of columns in ``aux_table`` that it is - trained to predict). + and ``"schema"`` (the names and data types of columns in ``aux_table`` + that it is trained to predict). See Also -------- @@ -230,7 +230,7 @@ def fit(self, X, y=None): fit_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_fit)( key_values, - aux_table.select(*assignment["columns"]).dataframe, + aux_table.select(*assignment["schema"].keys()).dataframe, assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), ) @@ -258,7 +258,7 @@ def _check_fit_results(self, results): failed_columns = [] for res in results: if res["failed"]: - failed_columns.extend(res["columns"]) + failed_columns.extend(res["schema"].keys()) if not failed_columns: return successful_results warnings.warn( @@ -301,7 +301,7 @@ def transform(self, X): prediction_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_predict)( key_values, - assignment["columns"], + assignment["schema"], assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), ns=get_df_namespace(self.aux_table)[0], @@ -330,10 +330,13 @@ def _check_prediction_results(self, results): # else: # dtype = object pred = ns.make_dataframe( - {c: [None for _ in range(res["shape"][0])] for c in res["columns"]}, + { + c: [None for _ in range(res["shape"][0])] + for c in res["schema"].keys() + }, ) new_res["predictions"] = pred - failed_columns.extend(res["columns"]) + failed_columns.extend(res["schema"].keys()) checked_results.append(new_res) if not failed_columns: return checked_results @@ -350,7 +353,7 @@ def _get_estimator_assignments(self): In many cases, a single estimator cannot handle all the target columns. This function groups columns that can be handled together and returns a - list of dictionaries, each with keys "columns" and "estimator". + list of dictionaries, each with keys "schema" and "estimator". Regression and classification targets are always handled separately. 
@@ -392,16 +395,23 @@ def _get_assignments_for_estimator(table, estimator): return [] if not _handles_multioutput(estimator): return [ - {"columns": [col], "estimator": estimator} for col in table.column_names + {"schema": {col: table.schema[col]}, "estimator": estimator} + for col in table.column_names ] table = table.persist() columns_with_nulls = [c for c in table.column_names if table.col(c).is_null().any()] assignments = [ - {"columns": [col], "estimator": estimator} for col in columns_with_nulls + {"schema": {col: table.schema[col]}, "estimator": estimator} + for col in columns_with_nulls ] columns_without_nulls = list(set(table.column_names).difference(columns_with_nulls)) if columns_without_nulls: - assignments.append({"columns": columns_without_nulls, "estimator": estimator}) + assignments.append( + { + "schema": {c: table.schema[c] for c in columns_without_nulls}, + "estimator": estimator, + } + ) return assignments @@ -435,13 +445,13 @@ def _fit(key_values, target_table, estimator, propagate_exceptions): failed = True estimator = None return { - "columns": target_table.column_names, + "schema": target_table.schema, "estimator": estimator, "failed": failed, } -def _predict(key_values, columns, estimator, propagate_exceptions, ns): +def _predict(key_values, schema, estimator, propagate_exceptions, ns): failed = False try: Y_values = estimator.predict(key_values) @@ -452,14 +462,15 @@ def _predict(key_values, columns, estimator, propagate_exceptions, ns): if failed: predictions = None else: + # TODO use schema predictions = ns.make_dataframe( - dict(zip(columns, np.atleast_2d(Y_values.T).T.T)) + dict(zip(schema.keys(), np.atleast_2d(Y_values.T).T.T)) ) return { "predictions": predictions, "failed": failed, - "columns": columns, - "shape": (key_values.shape[0], len(columns)), + "schema": schema, + "shape": (key_values.shape[0], len(schema)), } From 3c120b6b1c4f95febd5d323d8590e64aec18d7aa Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:13:30 +0100 Subject: [PATCH 03/17] failed results with right schema --- skrub/_dataframe/__init__.py | 5 ++- skrub/_dataframe/_common.py | 11 +++++ skrub/_dataframe/_namespace.py | 5 +++ skrub/_interpolation_joiner.py | 73 +++++++++++++++------------------- 4 files changed, 50 insertions(+), 44 deletions(-) diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py index 6dec0a176..3ad5b14d7 100644 --- a/skrub/_dataframe/__init__.py +++ b/skrub/_dataframe/__init__.py @@ -1,3 +1,4 @@ -from ._namespace import get_df_namespace +from ._common import std, stdns +from ._namespace import get_df_namespace, skrubns -__all__ = ["get_df_namespace"] +__all__ = ["get_df_namespace", "skrubns", "std", "stdns"] diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 8d7c6a662..814592238 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -7,3 +7,14 @@ class Selector(enum.Enum): NUMERIC = enum.auto() CATEGORICAL = enum.auto() STRING = enum.auto() + + +def std(obj): + try: + return obj.__dataframe_consortium_standard__() + except AttributeError: + return obj.__column_consortium_standard__() + + +def stdns(dataframe): + return dataframe.__dataframe_consortium_standard__().__dataframe_namespace__() diff --git a/skrub/_dataframe/_namespace.py b/skrub/_dataframe/_namespace.py index 06c65a2ea..b6c3f6972 100644 --- a/skrub/_dataframe/_namespace.py +++ b/skrub/_dataframe/_namespace.py @@ -97,3 +97,8 @@ def get_df_namespace(*dfs): "Only Pandas or Polars dataframes are currently supported, " 
f"got {modules=!r}." ) + + +def skrubns(*dataframes): + ns, _ = get_df_namespace(*dataframes) + return ns diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 1aa986b06..04aff393d 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -1,7 +1,7 @@ +import itertools import warnings import joblib -import numpy as np from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.ensemble import ( HistGradientBoostingClassifier, @@ -10,7 +10,7 @@ from sklearn.utils._tags import _safe_tags from skrub import _join_utils, _utils -from skrub._dataframe import get_df_namespace +from skrub._dataframe import skrubns, std, stdns from skrub._minhash_encoder import MinHashEncoder from skrub._table_vectorizer import TableVectorizer @@ -221,9 +221,9 @@ def fit(self, X, y=None): self._check_inputs() if X is not None: _join_utils.check_missing_columns(X, self._main_key, "'X' (the main table)") - aux_table = self.aux_table.__dataframe_consortium_standard__() + aux_table = std(self.aux_table) # TODO avoid conversion to pandas when TableVectorizer supports it - ns, _ = get_df_namespace(self.aux_table) + ns = skrubns(self.aux_table) df = ns.to_pandas(aux_table.select(*self._aux_key).dataframe) key_values = self.vectorizer_.fit_transform(df) estimators = self._get_estimator_assignments() @@ -285,7 +285,7 @@ def transform(self, X): The result of the join between `X` and inferred rows from ``self.aux_table``. """ - main_table = X.__dataframe_consortium_standard__() + main_table = std(X) _join_utils.check_missing_columns( main_table.dataframe, self._main_key, "'X' (the main table)" ) @@ -295,8 +295,7 @@ def transform(self, X): .dataframe ) # TODO avoid conversion to pandas when vectorizer supports it - ns, _ = get_df_namespace(df) - df = ns.to_pandas(df) + df = skrubns(df).to_pandas(df) key_values = self.vectorizer_.transform(df) prediction_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_predict)( @@ -304,36 +303,29 @@ def transform(self, X): assignment["schema"], assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), - ns=get_df_namespace(self.aux_table)[0], + api_ns=stdns(self.aux_table), ) for assignment in self.estimators_ ) prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - ns, _ = get_df_namespace(self.aux_table) - return ns.concatenate(main_table.dataframe, *predictions) + return skrubns(self.aux_table).concatenate(main_table.dataframe, *predictions) def _check_prediction_results(self, results): checked_results = [] failed_columns = [] - ns, _ = get_df_namespace(self.aux_table) + api_ns = stdns(self.aux_table) for res in results: new_res = dict(**res) if res["failed"]: - # TODO - # numeric_cols = list( - # ns.select(self.aux_table, ns.Selector.NUMERIC).columns - # ) - # if set(res["columns"]).issubset(numeric_cols): - # dtype = float - # else: - # dtype = object - pred = ns.make_dataframe( - { - c: [None for _ in range(res["shape"][0])] - for c in res["schema"].keys() - }, + pred = api_ns.dataframe_from_columns( + *[ + api_ns.column_from_sequence( + itertools.repeat(None, res["shape"][0]), name=c, dtype=dt + ) + for c, dt in res["schema"].items() + ], ) new_res["predictions"] = pred failed_columns.extend(res["schema"].keys()) @@ -365,10 +357,8 @@ def _get_estimator_assignments(self): When the estimator does not handle multi-output, an estimator is 
fitted separately to each column. """ - aux_table = self.aux_table.__dataframe_consortium_standard__().drop_columns( - *self._aux_key - ) - ns, _ = get_df_namespace(aux_table.dataframe) + aux_table = std(self.aux_table).drop_columns(*self._aux_key) + ns = skrubns(aux_table.dataframe) assignments = [] regression_table = ns.select(aux_table.dataframe, ns.Selector.NUMERIC) assignments.extend( @@ -390,7 +380,7 @@ def _get_assignments_for_estimator(table, estimator): # estimator is empty (eg the estimator is the regressor and there are no # numerical columns), return an empty list -- no columns are assigned to # that estimator. - table = table.__dataframe_consortium_standard__() + table = std(table) if not len(table.column_names): return [] if not _handles_multioutput(estimator): @@ -420,14 +410,10 @@ def _handles_multioutput(estimator): def _fit(key_values, target_table, estimator, propagate_exceptions): - target_table = target_table.__dataframe_consortium_standard__() + target_table = std(target_table) estimator = clone(estimator) - ns, _ = get_df_namespace(target_table.dataframe) - kept_rows = ~( - ns.any_rowwise(target_table.is_null().dataframe) - .__column_consortium_standard__() - .to_array() - ) + ns = skrubns(target_table.dataframe) + kept_rows = ~(std(ns.any_rowwise(target_table.is_null().dataframe)).to_array()) key_values = key_values[kept_rows] target_table = target_table.persist() Y = target_table.to_array(None)[kept_rows] @@ -451,7 +437,7 @@ def _fit(key_values, target_table, estimator, propagate_exceptions): } -def _predict(key_values, schema, estimator, propagate_exceptions, ns): +def _predict(key_values, schema, estimator, propagate_exceptions, api_ns): failed = False try: Y_values = estimator.predict(key_values) @@ -462,10 +448,13 @@ def _predict(key_values, schema, estimator, propagate_exceptions, ns): if failed: predictions = None else: - # TODO use schema - predictions = ns.make_dataframe( - dict(zip(schema.keys(), np.atleast_2d(Y_values.T).T.T)) - ) + if Y_values.ndim == 1: + Y_values = Y_values[:, None] + cols = [ + api_ns.column_from_1d_array(y.astype(type(y[0])), name=c, dtype=dt) + for y, (c, dt) in zip(Y_values.T, schema.items()) + ] + predictions = api_ns.dataframe_from_columns(*cols).dataframe return { "predictions": predictions, "failed": failed, @@ -479,7 +468,7 @@ def _add_column_name_suffix(dataframes, suffix): return dataframes renamed = [] for df in dataframes: - df = df.__dataframe_consortium_standard__() + df = std(df) renamed.append( df.rename_columns({c: f"{c}{suffix}" for c in df.column_names}).dataframe ) From 9e5630564889da8dc9d98bfb5c29a0814e112734 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:23:02 +0100 Subject: [PATCH 04/17] fix preserving pandas index --- skrub/_interpolation_joiner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 04aff393d..d02f6900e 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -310,7 +310,7 @@ def transform(self, X): prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - return skrubns(self.aux_table).concatenate(main_table.dataframe, *predictions) + return skrubns(self.aux_table).concatenate(X, *predictions) def _check_prediction_results(self, results): checked_results = [] @@ -326,7 +326,7 @@ def _check_prediction_results(self, 
results): ) for c, dt in res["schema"].items() ], - ) + ).dataframe new_res["predictions"] = pred failed_columns.extend(res["schema"].keys()) checked_results.append(new_res) From b0b7c8990b1b95f74b0b619f2d06b37ef416a382 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:44:42 +0100 Subject: [PATCH 05/17] update tests --- skrub/tests/test_interpolation_join.py | 93 ++++++++++++++------------ skrub/tests/test_select_cols.py | 13 +--- 2 files changed, 54 insertions(+), 52 deletions(-) diff --git a/skrub/tests/test_interpolation_join.py b/skrub/tests/test_interpolation_join.py index feee706c8..a73211d95 100644 --- a/skrub/tests/test_interpolation_join.py +++ b/skrub/tests/test_interpolation_join.py @@ -9,15 +9,15 @@ @pytest.fixture -def buildings(): - return pd.DataFrame( +def buildings(px): + return px.DataFrame( {"latitude": [1.0, 2.0], "longitude": [1.0, 2.0], "n_stories": [3, 7]} ) @pytest.fixture -def weather(): - return pd.DataFrame( +def weather(px): + return px.DataFrame( { "latitude": [1.2, 0.9, 1.9, 1.7, 5.0, 5.0], "longitude": [0.8, 1.1, 1.8, 1.8, 5.0, 5.0], @@ -31,20 +31,21 @@ def weather(): @pytest.mark.parametrize("with_nulls", [False, True]) def test_interpolation_join(buildings, weather, key, with_nulls): if not with_nulls: - weather = weather.fillna(0.0) + weather = weather.__dataframe_consortium_standard__().fill_null(0.0).dataframe transformed = InterpolationJoiner( weather, key=key, regressor=KNeighborsRegressor(2), classifier=KNeighborsClassifier(2), + suffix="_w", ).fit_transform(buildings) - assert_array_equal(transformed["avg_temp"].values, [10.5, 15.5]) - assert_array_equal(transformed["climate"].values, ["A", "B"]) + assert_array_equal(transformed["avg_temp_w"].to_numpy(), [10.5, 15.5]) + assert_array_equal(transformed["climate_w"].to_numpy(), ["A", "B"]) -def test_vectorizer(): - main = pd.DataFrame({"A": [0, 1]}) - aux = pd.DataFrame({"A": [11, 110], "B": [1, 0]}) +def test_vectorizer(px): + main = px.DataFrame({"A": [0, 1]}) + aux = px.DataFrame({"A": [11, 110], "B": [1, 0]}) class Vectorizer(TransformerMixin, BaseEstimator): def fit(self, X): @@ -71,18 +72,18 @@ def test_no_multioutput(buildings, weather): assert transformed.shape == (2, 5) -def test_condition_choice(): - main = pd.DataFrame({"A": [0, 1, 2]}) - aux = pd.DataFrame({"A": [0, 1, 2], "rB": [2, 0, 1], "C": [10, 11, 12]}) +def test_condition_choice(px): + main = px.DataFrame({"A": [0, 1, 2]}) + aux = px.DataFrame({"A": [0, 1, 2], "rB": [2, 0, 1], "C": [10, 11, 12]}) join = InterpolationJoiner( aux, key="A", regressor=KNeighborsRegressor(1) ).fit_transform(main) - assert_array_equal(join["C"].values, [10, 11, 12]) + assert_array_equal(join["C"].to_numpy(), [10, 11, 12]) join = InterpolationJoiner( - aux, main_key="A", aux_key="rB", regressor=KNeighborsRegressor(1) + aux, main_key="A", aux_key="rB", regressor=KNeighborsRegressor(1), suffix="_r" ).fit_transform(main) - assert_array_equal(join["C"].values, [11, 12, 10]) + assert_array_equal(join["C_r"].to_numpy(), [11, 12, 10]) with pytest.raises(ValueError, match="Must pass EITHER"): join = InterpolationJoiner( @@ -100,8 +101,8 @@ def test_condition_choice(): ).fit(None) -def test_suffix(): - df = pd.DataFrame({"A": [0, 1], "B": [0, 1]}) +def test_suffix(px): + df = px.DataFrame({"A": [0, 1], "B": [0, 1]}) join = InterpolationJoiner( df, key="A", suffix="_aux", regressor=KNeighborsRegressor(1) ).fit_transform(df) @@ -114,25 +115,28 @@ def test_mismatched_indexes(): join = InterpolationJoiner( aux, key="A", 
regressor=KNeighborsRegressor(1) ).fit_transform(main) - assert_array_equal(join["B"].values, [10, 11]) - assert_array_equal(join.index.values, [1, 0]) + assert_array_equal(join["B"].to_numpy(), [10, 11]) + assert_array_equal(join.index.to_numpy(), [1, 0]) -def test_fit_on_none(): +def test_fit_on_none(px): # X is hardly used in fit so it should be ok to fit without a main table - aux = pd.DataFrame({"A": [0, 1], "B": [10, 11]}) + aux = px.DataFrame({"A": [0, 1], "B": [10, 11]}) joiner = InterpolationJoiner(aux, key="A", regressor=KNeighborsRegressor(1)).fit( None ) - main = pd.DataFrame({"A": [0, 1]}, index=[1, 0]) + main = px.DataFrame({"A": [0, 1]}) + if isinstance(main, pd.DataFrame): + main.index = [1, 0] join = joiner.transform(main) - assert_array_equal(join["B"].values, [10, 11]) - assert_array_equal(join.index.values, [1, 0]) + assert_array_equal(join["B"].to_numpy(), [10, 11]) + if isinstance(main, pd.DataFrame): + assert_array_equal(join.index.to_numpy(), [1, 0]) -def test_join_on_date(): - sales = pd.DataFrame({"date": ["2023-09-20", "2023-09-29"], "n": [10, 15]}) - temp = pd.DataFrame( +def test_join_on_date(px): + sales = px.DataFrame({"date": ["2023-09-20", "2023-09-29"], "n": [10, 15]}) + temp = px.DataFrame( {"date": ["2023-09-09", "2023-10-01", "2024-09-21"], "temp": [-10, 10, 30]} ) transformed = ( @@ -145,7 +149,7 @@ def test_join_on_date(): .set_params(vectorizer__datetime_transformer__resolution=None) .fit_transform(sales) ) - assert_array_equal(transformed["temp"].values, [-10, 10]) + assert_array_equal(transformed["temp"].to_numpy(), [-10, 10]) class FailFit(DummyClassifier): @@ -154,7 +158,12 @@ def fit(self, X, y): def test_fit_failures(buildings, weather): - weather["climate"] = "A" + try: + weather["climate"] = "A" + except TypeError: + import polars as pl + + weather = weather.with_columns(pl.lit("A").alias("climate")) joiner = InterpolationJoiner( weather, key=["latitude", "longitude"], @@ -163,7 +172,7 @@ def test_fit_failures(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) assert join.shape == (2, 4) joiner = InterpolationJoiner( @@ -175,7 +184,7 @@ def test_fit_failures(buildings, weather): ) with pytest.warns(UserWarning, match="(?s)Estimators failed.*climate"): join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) assert join.shape == (2, 4) joiner = InterpolationJoiner( @@ -203,9 +212,9 @@ def test_transform_failures(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) - assert join["climate"].isnull().all() - assert join["climate"].dtype == object + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) + assert join["climate"].__column_consortium_standard__().is_null().all() + assert join["climate"].dtype == weather["climate"].dtype assert join.shape == (2, 5) joiner = InterpolationJoiner( @@ -217,9 +226,9 @@ def test_transform_failures(buildings, weather): ) with pytest.warns(UserWarning, match="(?s)Prediction failed.*climate"): join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) - assert join["climate"].isnull().all() - assert join["climate"].dtype == object + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) + assert 
join["climate"].__column_consortium_standard__().is_null().all() + assert join["climate"].dtype == weather["climate"].dtype assert join.shape == (2, 5) joiner = InterpolationJoiner( @@ -242,8 +251,8 @@ def test_transform_failures_dtype(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert join["avg_temp"].isnull().all() - assert join["avg_temp"].dtype == "float64" + assert join["avg_temp"].__column_consortium_standard__().is_null().all() + assert join["avg_temp"].dtype == join["avg_temp"].dtype assert join.shape == (2, 5) joiner = InterpolationJoiner( @@ -254,6 +263,6 @@ def test_transform_failures_dtype(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert join["climate"].isnull().all() - assert join["climate"].dtype == object + assert join["climate"].__column_consortium_standard__().is_null().all() + assert join["climate"].dtype == weather["climate"].dtype assert join.shape == (2, 5) diff --git a/skrub/tests/test_select_cols.py b/skrub/tests/test_select_cols.py index 3ab07e590..4b08a255c 100644 --- a/skrub/tests/test_select_cols.py +++ b/skrub/tests/test_select_cols.py @@ -3,18 +3,11 @@ import pytest from skrub import DropCols, SelectCols -from skrub._dataframe._polars import POLARS_SETUP -DATAFRAME_MODULES = [pandas] -if POLARS_SETUP: - import polars - DATAFRAME_MODULES.append(polars) - - -@pytest.fixture(params=DATAFRAME_MODULES) -def df(request): - return request.param.DataFrame({"A": [1, 2], "B": [10, 20], "C": ["x", "y"]}) +@pytest.fixture +def df(px): + return px.DataFrame({"A": [1, 2], "B": [10, 20], "C": ["x", "y"]}) def test_select_cols(df): From adda6e45b54a075ad58adeec8a612d30a0ee0d10 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:45:36 +0100 Subject: [PATCH 06/17] add conftest --- skrub/tests/conftest.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 skrub/tests/conftest.py diff --git a/skrub/tests/conftest.py b/skrub/tests/conftest.py new file mode 100644 index 000000000..d142f8b9c --- /dev/null +++ b/skrub/tests/conftest.py @@ -0,0 +1,15 @@ +import pandas +import pytest + +DATAFRAME_MODULES = [pandas] +try: + import polars + + DATAFRAME_MODULES.append(polars) +except ImportError: + pass + + +@pytest.fixture(params=DATAFRAME_MODULES) +def px(request): + return request.param From 668b8490a3aa332dc47ee3488b9145aefb2966fc Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:53:05 +0100 Subject: [PATCH 07/17] add dataframe_api_compat --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 4abe5b61e..099531f2b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ install_requires = scipy>=1.9.3 pandas>=1.5.3 packaging>=23.1 + dataframe_api_compat>=0.1.28 python_requires = >=3.10 [options.extras_require] From 6729905bb8c9651708f34cff42278c5cbe5263c1 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 16:09:41 +0100 Subject: [PATCH 08/17] update doctests --- setup.cfg | 2 +- skrub/_interpolation_joiner.py | 1 + skrub/_joiner.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 099531f2b..f128297c5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ install_requires = scipy>=1.9.3 pandas>=1.5.3 packaging>=23.1 - dataframe_api_compat>=0.1.28 + dataframe-api-compat>=0.1.28 python_requires = >=3.10 [options.extras_require] diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index d02f6900e..151eecaa4 
100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -136,6 +136,7 @@ class InterpolationJoiner(TransformerMixin, BaseEstimator): Examples -------- + >>> import pandas as pd >>> buildings = pd.DataFrame( ... {"latitude": [1.0, 2.0], "longitude": [1.0, 2.0], "n_stories": [3, 7]} ... ) diff --git a/skrub/_joiner.py b/skrub/_joiner.py index 77a48b70d..c3b1c3f91 100644 --- a/skrub/_joiner.py +++ b/skrub/_joiner.py @@ -78,6 +78,7 @@ class Joiner(TransformerMixin, BaseEstimator): Examples -------- + >>> import pandas as pd >>> X = pd.DataFrame(['France', 'Germany', 'Italy'], columns=['Country']) >>> X Country From 9736d0d9b9ad878972f018b2785db0d5eb55ed11 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 16:39:17 +0100 Subject: [PATCH 09/17] bump pandas version to have dataframe api support --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f128297c5..4cc6490d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,7 +28,7 @@ install_requires = scikit-learn>=1.2.1 numpy>=1.23.5 scipy>=1.9.3 - pandas>=1.5.3 + pandas>=2.1.0 packaging>=23.1 dataframe-api-compat>=0.1.28 python_requires = >=3.10 From 418f3f36a00b79bfca97252a26975c39365d3450 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 16:43:14 +0100 Subject: [PATCH 10/17] update setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4cc6490d1..a9a20c3fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -80,7 +80,7 @@ min-py310 = scikit-learn==1.2.1 numpy==1.23.5 scipy==1.9.3 - pandas==1.5.3 + pandas==2.1.0 [flake8] # max line length for black From b5f2c5687fc576c4542e352c7df5b51f327fb884 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 18:04:39 +0100 Subject: [PATCH 11/17] upgrade scikit-learn --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index a9a20c3fb..b7fc345f2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ project_urls = include_package_data = True packages = find: install_requires = - scikit-learn>=1.2.1 + scikit-learn>=1.3.0 numpy>=1.23.5 scipy>=1.9.3 pandas>=2.1.0 @@ -77,7 +77,7 @@ benchmarks = # Overwrite the previous install_requires for CI testing purposes # as defined in testing.yml. 
min-py310 = - scikit-learn==1.2.1 + scikit-learn==1.3.0 numpy==1.23.5 scipy==1.9.3 pandas==2.1.0 From b7b0a5904b478eff4f614105f2e803065ee1e2ba Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 18:04:48 +0100 Subject: [PATCH 12/17] remove unused to_numpy --- skrub/_dataframe/_pandas.py | 5 ----- skrub/_dataframe/_polars.py | 7 ------- 2 files changed, 12 deletions(-) diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 499bdd446..4c439a492 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -23,7 +23,6 @@ "concatenate", "any_rowwise", "collect", - "to_numpy", "to_pandas", ] @@ -384,7 +383,3 @@ def collect(dataframe): def to_pandas(dataframe): return dataframe - - -def to_numpy(dataframe): - return dataframe.to_numpy() diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index d17af0dd4..1ff2b5e96 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -28,7 +28,6 @@ "concatenate", "any_rowwise", "collect", - "to_numpy", "to_pandas", ] @@ -328,9 +327,3 @@ def to_pandas(dataframe): if hasattr(dataframe, "collect"): dataframe = dataframe.collect() return dataframe.to_pandas() - - -def to_numpy(dataframe): - if hasattr(dataframe, "to_numpy"): - return dataframe.to_numpy() - return dataframe.collect().to_numpy() From c8b24f3f92b6db7db3f5c8dd502e30006dbf8d5b Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:22:29 +0100 Subject: [PATCH 13/17] add more tests --- .github/workflows/testing.yml | 1 + skrub/tests/conftest.py => conftest.py | 0 skrub/_dataframe/__init__.py | 4 +-- skrub/_dataframe/_common.py | 1 - skrub/_dataframe/_pandas.py | 9 +----- skrub/_dataframe/_polars.py | 9 ++---- skrub/_dataframe/tests/test_all.py | 43 ++++++++++++++++++++++++++ 7 files changed, 50 insertions(+), 17 deletions(-) rename skrub/tests/conftest.py => conftest.py (100%) create mode 100644 skrub/_dataframe/tests/test_all.py diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 5f047366d..787f86757 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -75,6 +75,7 @@ jobs: - shell: bash {0} run: | cp $GITHUB_WORKSPACE/pyproject.toml . + cp $GITHUB_WORKSPACE/conftest.py . 
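
Moving `conftest.py` to the repository root (and copying it into the CI
working directory above) makes the parametrized `px` fixture available to
every test package, including the new `skrub/_dataframe/tests`. A sketch of
a hypothetical test using it:

    def test_roundtrip(px):
        # `px` is bound to the pandas module, then to the polars module when
        # it is installed, so one test body exercises both backends
        df = px.DataFrame({"a": [1, 2]})
        assert list(df.columns) == ["a"]
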
$GITHUB_WORKSPACE/build_tools/github/test.sh working-directory: ${{ runner.temp }} name: 'Run tests' diff --git a/skrub/tests/conftest.py b/conftest.py similarity index 100% rename from skrub/tests/conftest.py rename to conftest.py diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py index 3ad5b14d7..038e4ef1c 100644 --- a/skrub/_dataframe/__init__.py +++ b/skrub/_dataframe/__init__.py @@ -1,4 +1,4 @@ -from ._common import std, stdns +from ._common import Selector, std, stdns from ._namespace import get_df_namespace, skrubns -__all__ = ["get_df_namespace", "skrubns", "std", "stdns"] +__all__ = ["get_df_namespace", "skrubns", "std", "stdns", "Selector"] diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 814592238..bb95b61ba 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -6,7 +6,6 @@ class Selector(enum.Enum): NONE = enum.auto() NUMERIC = enum.auto() CATEGORICAL = enum.auto() - STRING = enum.auto() def std(obj): diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 4c439a492..2faf940f2 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -22,7 +22,6 @@ "Selector", "concatenate", "any_rowwise", - "collect", "to_pandas", ] @@ -356,14 +355,12 @@ def select(dataframe, columns): return dataframe.select_dtypes("number") elif columns is Selector.CATEGORICAL: return dataframe.select_dtypes(["object", "string", "category"]) - elif columns is Selector.STRING: - return dataframe.select_dtypes(["string"]) # we have covered all items in the enumeration assert False def drop(dataframe, columns): - return dataframe.drop(select(dataframe, columns).columns.values) + return dataframe.drop(select(dataframe, columns).columns.values, axis=1) def any_rowwise(dataframe): @@ -377,9 +374,5 @@ def concatenate(dataframe, *other_dataframes): return pd.concat([dataframe] + list(other_dataframes), axis=1) -def collect(dataframe): - return dataframe - - def to_pandas(dataframe): return dataframe diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index 1ff2b5e96..e179d0b31 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -27,7 +27,6 @@ "Selector", "concatenate", "any_rowwise", - "collect", "to_pandas", ] @@ -292,8 +291,6 @@ def _check_selector(columns): return cs.numeric() elif columns is Selector.CATEGORICAL: return cs.string(include_categorical=True) - elif columns is Selector.STRING: - return cs.string() # we have covered all items in the enumeration assert False @@ -307,17 +304,17 @@ def drop(dataframe, columns): def any_rowwise(dataframe): - return collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") + return _collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") def concatenate(dataframe, *other_dataframes): return pl.concat( - [collect(dataframe)] + [collect(df) for df in other_dataframes], + [_collect(dataframe)] + [_collect(df) for df in other_dataframes], how="horizontal", ) -def collect(dataframe): +def _collect(dataframe): if hasattr(dataframe, "collect"): dataframe = dataframe.collect() return dataframe diff --git a/skrub/_dataframe/tests/test_all.py b/skrub/_dataframe/tests/test_all.py new file mode 100644 index 000000000..d46140968 --- /dev/null +++ b/skrub/_dataframe/tests/test_all.py @@ -0,0 +1,43 @@ +import pytest + +from skrub._dataframe import Selector, skrubns + + +@pytest.fixture +def df(px): + return px.DataFrame( + {"ID": [2, 3, 7], "name": ["ab", "cd", "01"], "temp": [20.3, 40.9, 11.5]} + 
) + + +def test_select(df): + ns = skrubns(df) + assert list(ns.select(df, []).columns) == [] + assert list(ns.select(df, ["name"]).columns) == ["name"] + assert list(ns.select(df, Selector.ALL).columns) == list(df.columns) + assert list(ns.select(df, Selector.NONE).columns) == [] + assert list(ns.select(df, Selector.NUMERIC).columns) == ["ID", "temp"] + assert list(ns.select(df, Selector.CATEGORICAL).columns) == ["name"] + + +def test_drop(df): + ns = skrubns(df) + assert list(ns.drop(df, []).columns) == list(df.columns) + assert list(ns.drop(df, ["name"]).columns) == ["ID", "temp"] + assert list(ns.drop(df, Selector.ALL).columns) == [] + assert list(ns.drop(df, Selector.NONE).columns) == list(df.columns) + assert list(ns.drop(df, Selector.NUMERIC).columns) == ["name"] + assert list(ns.drop(df, Selector.CATEGORICAL).columns) == ["ID", "temp"] + + +def test_concatenate(df): + ns = skrubns(df) + df1 = ( + df.__dataframe_consortium_standard__() + .rename_columns({c: f"{c}_1" for c in df.columns}) + .dataframe + ) + out = ns.concatenate(df) + assert list(out.columns) == list(df.columns) + out = ns.concatenate(df, df1) + assert list(out.columns) == list(df.columns) + list(df1.columns) From a9ac96da20c6c990305ca106fdb86a50f12c09f9 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:23:39 +0100 Subject: [PATCH 14/17] update changelog --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index dd6726821..caf4351bf 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -17,7 +17,7 @@ Major changes * :class:`InterpolationJoiner` was added to join two tables by using machine-learning to infer the matching rows from the second table. - :pr:`742` by :user:`Jérôme Dockès `. + :pr:`742` and :pr:`827` by :user:`Jérôme Dockès `. * Pipelines including :class:`TableVectorizer` can now be grid-searched, since we can now call `set_params` on the default transformers of :class:`TableVectorizer`. 
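
The next commit generalizes the `std`/`stdns` helpers that patch 03 added to
`_common.py`. As introduced there, they wrap a native dataframe into the
consortium-standard API and fetch the standard namespace for its backend; a
usage sketch (pandas shown, polars behaves the same):

    import pandas as pd

    from skrub._dataframe import std, stdns

    df = pd.DataFrame({"a": [1.0, None]})
    std(df).column_names       # ['a'] -- the standard dataframe wrapper
    ns = stdns(df)             # the standard namespace for this backend
    ns.dataframe_from_columns  # constructor used by _predict in patch 03
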
From 2e6e19cdeb200270a7a4d084be7c4584ad07d860 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:28:11 +0100 Subject: [PATCH 15/17] column namespace --- skrub/_dataframe/_common.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index bb95b61ba..78edd4e5a 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -15,5 +15,8 @@ def std(obj): return obj.__column_consortium_standard__() -def stdns(dataframe): - return dataframe.__dataframe_consortium_standard__().__dataframe_namespace__() +def stdns(obj): + try: + return obj.__dataframe_consortium_standard__().__dataframe_namespace__() + except AttributeError: + return obj.__column_consortium_standard__().__column_namespace__() From 1be7187abe8ab95b67a01baf596495351a1c4d4f Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:47:28 +0100 Subject: [PATCH 16/17] more tests --- skrub/_dataframe/tests/test_common.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 skrub/_dataframe/tests/test_common.py diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py new file mode 100644 index 000000000..182a664e8 --- /dev/null +++ b/skrub/_dataframe/tests/test_common.py @@ -0,0 +1,11 @@ +from skrub._dataframe import skrubns, std, stdns + + +def test_std(px): + df = px.DataFrame({"A": [1, 2]}) + assert hasattr(std(df), "dataframe") + assert hasattr(stdns(df), "dataframe_from_columns") + ns = skrubns(df) + s = ns.make_series([1, 2], name="A") + assert hasattr(std(s), "column") + assert hasattr(stdns(s), "dataframe_from_columns") From c09f577a1e766e2657d94cf277ab6645a0e6ffb1 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:58:25 +0100 Subject: [PATCH 17/17] rename concatenate -> concat_horizontal --- skrub/_dataframe/_pandas.py | 4 ++-- skrub/_dataframe/_polars.py | 4 ++-- skrub/_dataframe/tests/test_all.py | 6 +++--- skrub/_interpolation_joiner.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 2faf940f2..c64ebdff6 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -20,7 +20,7 @@ "select", "drop", "Selector", - "concatenate", + "concat_horizontal", "any_rowwise", "to_pandas", ] @@ -367,7 +367,7 @@ def any_rowwise(dataframe): return dataframe.any(axis=1) -def concatenate(dataframe, *other_dataframes): +def concat_horizontal(dataframe, *other_dataframes): other_dataframes = [ df.set_axis(dataframe.index, axis="index") for df in other_dataframes ] diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index e179d0b31..1b52ee6ea 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -25,7 +25,7 @@ "select", "drop", "Selector", - "concatenate", + "concat_horizontal", "any_rowwise", "to_pandas", ] @@ -307,7 +307,7 @@ def any_rowwise(dataframe): return _collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") -def concatenate(dataframe, *other_dataframes): +def concat_horizontal(dataframe, *other_dataframes): return pl.concat( [_collect(dataframe)] + [_collect(df) for df in other_dataframes], how="horizontal", diff --git a/skrub/_dataframe/tests/test_all.py b/skrub/_dataframe/tests/test_all.py index d46140968..8c18fdbc5 100644 --- a/skrub/_dataframe/tests/test_all.py +++ b/skrub/_dataframe/tests/test_all.py @@ -30,14 +30,14 @@ def test_drop(df): assert list(ns.drop(df, 
Selector.CATEGORICAL).columns) == ["ID", "temp"] -def test_concatenate(df): +def test_concat_horizontal(df): ns = skrubns(df) df1 = ( df.__dataframe_consortium_standard__() .rename_columns({c: f"{c}_1" for c in df.columns}) .dataframe ) - out = ns.concatenate(df) + out = ns.concat_horizontal(df) assert list(out.columns) == list(df.columns) - out = ns.concatenate(df, df1) + out = ns.concat_horizontal(df, df1) assert list(out.columns) == list(df.columns) + list(df1.columns) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 151eecaa4..f696dd4df 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -311,7 +311,7 @@ def transform(self, X): prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - return skrubns(self.aux_table).concatenate(X, *predictions) + return skrubns(self.aux_table).concat_horizontal(X, *predictions) def _check_prediction_results(self, results): checked_results = []
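
Taken together, the series lets `InterpolationJoiner` consume and return
polars frames end to end. A condensed sketch mirroring the updated tests
(assuming polars and the bumped dependencies are installed):

    import polars as pl
    from sklearn.neighbors import KNeighborsRegressor

    from skrub import InterpolationJoiner

    buildings = pl.DataFrame(
        {"latitude": [1.0, 2.0], "longitude": [1.0, 2.0], "n_stories": [3, 7]}
    )
    weather = pl.DataFrame(
        {
            "latitude": [1.2, 0.9, 1.9, 1.7],
            "longitude": [0.8, 1.1, 1.8, 1.8],
            "avg_temp": [10.5, 10.5, 15.5, 15.5],
        }
    )
    join = InterpolationJoiner(
        weather,
        key=["latitude", "longitude"],
        regressor=KNeighborsRegressor(2),
    ).fit_transform(buildings)
    # `join` is a polars DataFrame: `buildings` plus a predicted "avg_temp"
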